In [1]:
import getpass
import json

import numpy as np
import pandas as pd
import xlsxwriter
from jira import JIRA

# Read connection settings, query fragments and output options from jira.json.
with open('jira.json') as json_data_file:
    data = json.load(json_data_file)

# .get() lets a key that is absent from the file fall through to the prompts
# below instead of aborting the cell with KeyError.
_auth = data.get('auth') or {}
username = _auth.get('username')
password = _auth.get('password')
bugqueryadd = data.get('bugqueryadd')
epicqueryadd = data.get('epicqueryadd')
storyqueryadd = data.get('storyqueryadd')
domain = data.get('domain')
columns = data.get('columns')
fields = data.get('fields')
outfile = data.get('outfile')

# raw_input() only exists on Python 2; on Python 3 the same function is input().
try:
    _prompt = raw_input
except NameError:
    _prompt = input


def _prompt_fixversion_clause():
    """Ask for fix versions and wrap the answer in a JQL `fixversion in (...)` clause."""
    versions = _prompt("List of fixversions (no quotes, commas allowed):")
    return 'fixversion in (' + versions + ')'


# Every setting left empty in the file is requested interactively.
if not domain:
    domain = _prompt("Jira Domain (e.g https://XXX:PPP/jira): ")

if not username:
    username = _prompt("Username: ")

if not password:
    # getpass keeps the password from being echoed into the notebook output.
    password = getpass.getpass("Password: ")

if not columns:
    columns = _prompt("Columns (List of colums): ")

if not fields:
    fields = _prompt("Fields (List of JQL fields): ")

# NOTE(review): the three prompts below are worded identically although they
# feed the bug, epic and story queries respectively (asked in that order).
if not bugqueryadd:
    bugqueryadd = _prompt_fixversion_clause()

if not epicqueryadd:
    epicqueryadd = _prompt_fixversion_clause()

if not storyqueryadd:
    storyqueryadd = _prompt_fixversion_clause()
    
def get_jira_client(domain, username, password):
    """Return a JIRA client for the server at `domain`, authenticated with basic auth."""
    credentials = (username, password)
    return JIRA({'server': domain}, basic_auth=credentials)

def print_jira_issue(issue):
    """Print the issue key and its summary, separated by a colon.

    `issue` is a raw JSON issue dict with a 'key' entry and a 'fields' dict
    containing 'summary'.
    """
    key = issue['key']
    summary = issue['fields']['summary']
    print(key, ":", summary)
    


In [2]:
# Open the Excel workbook at the configured `outfile` path; later cells add one sheet per report via to_excel(writer, ...).
writer = pd.ExcelWriter(outfile)

In [3]:
# Create the Jira client from the domain and credentials gathered in the configuration cell.
jira = get_jira_client(domain, username, password)

In [4]:
# Fetch epics matching the configured filter as raw JSON (json_result=True), limited to the configured `fields`.
# NOTE(review): at most 1000 issues are requested (maxResults=1000); presumably any further matches are not retrieved - confirm the result set is smaller.
epics = jira.search_issues('type=epic and ' + epicqueryadd, json_result=True, maxResults=1000, fields = fields)

In [5]:
# Fetch stories matching the configured filter as raw JSON; expand='changelog' adds the history that the
# preparation cell reads to derive the "<status> Set To Date" / "<status> Set By" fields.
# NOTE(review): at most 1000 issues are requested (maxResults=1000); presumably any further matches are not retrieved - confirm the result set is smaller.
stories = jira.search_issues('type=story and ' + storyqueryadd, json_result=True, maxResults=1000, fields = fields, expand='changelog')

In [None]:
# Fetch bugs matching the configured filter as raw JSON (json_result=True), limited to the configured `fields`.
# NOTE(review): at most 1000 issues are requested (maxResults=1000); presumably any further matches are not retrieved - confirm the result set is smaller.
bugs = jira.search_issues('type=bug and ' + bugqueryadd, json_result=True, maxResults=1000, fields = fields)

In [None]:
# Enrich every raw story in place before the dataframes are built:
#  * 'textinfo'  - comment bodies, summary and description joined into one string
#  * '<status> Set To Date' / '<status> Set By' - taken from each status change in the changelog

for story in stories['issues']:
    story_fields = story['fields']

    # gather the free-text parts; summary and description are skipped when they are None
    text_parts = [entry['body'] for entry in story_fields['comment']['comments']]
    for text_field in ('summary', 'description'):
        if story_fields[text_field] is not None:
            text_parts.append(story_fields[text_field])
    try:
        story_fields['textinfo'] = ' '.join(text_parts)
    except TypeError:
        # a part was not a string; show the offending list and leave 'textinfo' unset
        print(text_parts)

    # stories only: a later history entry for the same status overwrites an earlier one
    for history in story['changelog']['histories']:
        status_changes = [change for change in history['items'] if change['field'] == 'status']
        for change in status_changes:
            new_status = change['toString']
            story_fields[new_status + ' ' + 'Set To Date'] = history['created']
            story_fields[new_status + ' ' + 'Set By'] = history['author']['name']

# Epics: 'textinfo' is the comment bodies followed by the summary (the description is not included).
for epic_issue in epics['issues']:
    epic_fields = epic_issue['fields']
    comment_bodies = [entry['body'] for entry in epic_fields['comment']['comments']]
    epic_fields['textinfo'] = ' '.join(comment_bodies + [epic_fields['summary']])

# Copy each issue key into its 'fields' dict so it survives as a column, then build one row per issue.
for epic in epics['issues']:
    epic['fields'].update(key=epic['key'])
epic_list = [epic['fields'] for epic in epics['issues']]

epics_df = pd.DataFrame(epic_list)

for story in stories['issues']:
    story['fields'].update(key=story['key'])
story_list = [story['fields'] for story in stories['issues']]

stories_df = pd.DataFrame(story_list)

# Custom field ids are replaced by their display names only inside the dataframes.
# Fetch the field catalogue once and index it both ways (name -> id and id -> name).
allfields = jira.fields()
nameMap = {}
idMap = {}
for field in allfields:
    nameMap[field['name']] = field['id']
    idMap[field['id']] = field['name']

# Every column whose label contains 'custom' is looked up in idMap and renamed in one call per frame.
epics_df.rename(columns={label: idMap[label] for label in epics_df.columns if 'custom' in label}, inplace=True)
stories_df.rename(columns={label: idMap[label] for label in stories_df.columns if 'custom' in label}, inplace=True)

def _first_entry(values, attribute):
    """Return `attribute` of the first dict in `values`, or None unless `values` is a non-empty list."""
    if isinstance(values, list) and values:
        return values[0].get(attribute)
    return None


# Flatten the nested Jira objects into plain scalars; rows that were NaN stay NaN
# because the result of dropna().apply(...) is re-aligned on assignment.
stories_df['Team'] = stories_df['Team'].dropna().apply(lambda x: _first_entry(x, 'value'))
stories_df['status'] = stories_df['status'].dropna().apply(lambda x: x.get('name'))
stories_df['reporter'] = stories_df['reporter'].dropna().apply(lambda x: x.get('name'))
# The value is indexed with [0], so it is a list; the previous `type(x) == dict`
# guard could never be true for a list and turned every fix version into None.
stories_df['fixVersions'] = stories_df['fixVersions'].dropna().apply(lambda x: _first_entry(x, 'name'))
stories_df['Platform'] = stories_df['Platform'].dropna().apply(lambda x: _first_entry(x, 'value'))

# Convert the "<status> Set To Date" strings into pandas datetimes; values that cannot be parsed become NaT.

from datetime import datetime
from datetime import timedelta

for date_column in (
    'Approval Set To Date',
    'Closed Set To Date',
    'Code Review Set To Date',
    'In Analysis Set To Date',
    'In Progress Set To Date',
    'In UI/UX Set To Date',
    'Open Set To Date',
    'Ready for Estimation Set To Date',
    'Testing Set To Date',
):
    stories_df[date_column] = pd.to_datetime(stories_df[date_column], format='%Y-%m-%dT%H:%M:%S.%f', errors='coerce')


In [None]:
#extract the sprint information from the sprints field and create a separate sprints-issue dataframe
#this is only possible once we have the stories dataframe

from functools import reduce

#Parses ONE sprint string of the form
#  'com.atlassian.greenhopper.service.sprint.Sprint@1b7eb58a[id=519,rapidViewId=219,state=CLOSED,name=Knight Riders Sprint 2018 - 22,startDate=2018-05-23T21:16:06.149+05:30,endDate=2018-06-05T19:44:00.000+05:30,completeDate=<null>,sequence=519]'
#into a dictionary of its key=value pairs, paired with the issue key.
def getSprintInfo(issueKey, sprint):
    """Return the fields of one sprint string as a dict, plus an 'issue_key' entry.

    issueKey -- key of the issue the sprint belongs to; stored under 'issue_key'.
    sprint   -- sprint string whose 'key=value' pairs are enclosed in square brackets.

    All values are returned as strings exactly as they appear (e.g. '<null>').
    Commas and '=' inside a value (typically the sprint name) are kept as part
    of that value instead of raising ValueError.
    """
    #locate the part in square braces; rfind so a ']' inside a value does not cut the text short
    start = sprint.find('[') + 1
    end = sprint.rfind(']')
    if end < start:
        end = len(sprint)

    dict_sprint = {}
    last_key = None
    for segment in sprint[start:end].split(','):
        key, separator, value = segment.partition('=')
        if separator and key.isalnum():
            # a genuine 'key=value' pair; only the first '=' separates key from value
            dict_sprint[key] = value
            last_key = key
        elif last_key is not None:
            # the comma belonged to the previous value (e.g. a sprint named "Q2, wave 1")
            dict_sprint[last_key] += ',' + segment
    dict_sprint['issue_key'] = issueKey
    return dict_sprint

#Builds the sprint records of one issue: one dictionary per sprint string, each tagged with the issue key.
def getSprints(issueKey, sprints):
    """Return a list of sprint dicts for `issueKey`; anything that is not a list yields an empty list."""
    if type(sprints) != list:
        return []
    return [getSprintInfo(issueKey, entry) for entry in sprints]

# Collect one record per (story, sprint) pair. extend() over the zipped columns avoids
# re-copying the whole list on every row (x1 = x1 + ...) and the per-row Series that iterrows() builds.
x1 = []
for story_key, story_sprints in zip(stories_df['key'], stories_df['Sprint']):
    x1.extend(getSprints(story_key, story_sprints))

sprints_df = pd.DataFrame(x1)

# When no story carries sprint data the frame has no columns at all; create the
# ones used below and by the later merge so the notebook does not stop with KeyError.
for required_column in ('issue_key', 'name', 'startDate', 'endDate', 'completeDate'):
    if required_column not in sprints_df.columns:
        sprints_df[required_column] = None

# Unparseable values such as '<null>' become NaT.
# NOTE(review): sprint timestamps carry a UTC offset (e.g. 2018-05-23T21:16:06.149+05:30) that the
# format string does not mention; confirm the installed pandas parses them rather than coercing to NaT.
for date_column in ('endDate', 'startDate', 'completeDate'):
    sprints_df[date_column] = pd.to_datetime(sprints_df[date_column], format='%Y-%m-%dT%H:%M:%S.%f', errors='coerce')

In [None]:
#prep up the bugs dataframe

# Link descriptions that tie a bug to the story it belongs to.
STORY_LINK_TYPES = ('associated with', 'relates to')

bugs_list = []
for bug in bugs['issues']:
    bug_fields = bug['fields']
    bug_fields['key'] = bug['key']
    for issuelink in bug_fields['issuelinks']:
        # A link carries either 'outwardIssue' or 'inwardIssue'. Test for the key explicitly
        # (outward first, as before) instead of using a bare `except:` as control flow,
        # which also hid unrelated errors.
        for direction in ('outward', 'inward'):
            linked_issue = issuelink.get(direction + 'Issue')
            if linked_issue is None:
                continue
            if linked_issue['fields']['issuetype']['name'] == 'Story':
                link_label = issuelink['type'][direction]
                if link_label in STORY_LINK_TYPES:
                    # when several links qualify, the last one wins
                    bug_fields['linkKey'] = linked_issue['key']
                    bug_fields['linktype'] = link_label
            break
    #add each bug to bug list after updating the fields
    bugs_list.append(bug_fields)

bugs_df = pd.DataFrame(bugs_list)

# If no bug links to a story the link columns would be missing and the later merge on
# 'linkKey' would raise KeyError; create them empty instead.
for link_column in ('linkKey', 'linktype'):
    if link_column not in bugs_df.columns:
        bugs_df[link_column] = None

# Replace custom field ids by their display names, as done for epics and stories.
bugs_df.rename(columns={label: idMap[label] for label in bugs_df.columns if 'custom' in label}, inplace=True)

In [None]:
# First merge: attach each story to its epic. how='right' keeps every story, including
# those whose 'Epic Link' matches no epic; clashing column names get _epic / _story suffixes.
scope_df = epics_df.merge(
    stories_df,
    how='right',
    left_on='key',
    right_on='Epic Link',
    sort=True,
    suffixes=('_epic', '_story'),
)

# Spreadsheet formula that renders the story key as a link to the issue in Jira.
story_keys = scope_df['key_story']
scope_df['story_link'] = '=HYPERLINK("' + domain + '/browse/' + story_keys + '","' + story_keys + '")'

In [None]:
# Attach the sprint records to the story/epic rows: a story yields one row per sprint it was in,
# and stories without a sprint are kept (how='left'). 'issue_key' duplicates 'key_story', so it is dropped.
sprintsWithStoriesAndEpics_df = scope_df.merge(
    sprints_df,
    how='left',
    left_on='key_story',
    right_on='issue_key',
    suffixes=('_story', '_sprint'),
).drop(columns=['issue_key'])

In [None]:
# Keep only rows whose sprint ended strictly inside the window of interest
# (both bounds exclusive; rows without an end date are dropped as well).
window_start = datetime(2018, 4, 3)
window_end = datetime(2018, 7, 5)

sprint_end = sprintsWithStoriesAndEpics_df['endDate']
sprintsWithStoriesAndEpics_df = sprintsWithStoriesAndEpics_df[(sprint_end > window_start) & (sprint_end < window_end)]

In [None]:
#Let's do some basic statistics.
#Count the distinct story keys - rows are duplicated because a story can be part of multiple sprints.
#These are the stories that were worked on, not necessarily finished: they were simply inside a sprint in the window.
sprintsWithStoriesAndEpics_df['key_story'].unique().size

In [None]:
#Count the distinct epic keys among the rows in the window.
#NOTE(review): unique() keeps a missing key as a value, so stories without an epic presumably add one to this count.
sprintsWithStoriesAndEpics_df['key_epic'].unique().size

In [None]:
#Keep only the rows whose story status is 'Closed', then count the distinct story keys that remain.
#NOTE: this overwrites sprintsWithStoriesAndEpics_df, so every later cell works on closed stories only.
sprintsWithStoriesAndEpics_df = sprintsWithStoriesAndEpics_df[sprintsWithStoriesAndEpics_df['status_story'] == 'Closed']
sprintsWithStoriesAndEpics_df['key_story'].unique().shape

#Compare with the previous count: a difference means not all stories in the window were closed.

In [None]:
#Per team: number of story rows, total story points and number of distinct sprint names,
#plus the average velocity (points per sprint).
#NOTE(review): rows are (story, sprint) pairs, so a story that was in several sprints is counted,
#and its points summed, once per sprint - confirm this is intended.

#first add up the number of sprints a story is in
#sprintsWithStoriesAndEpics_dfCopy = sprintsWithStoriesAndEpics_df.drop_duplicates(subset = 'key_story')

sprintsWithStoriesAndEpics_dfCopy = sprintsWithStoriesAndEpics_df[['Team_story', 'key_story', 'Story Points', 'name']].copy()
#agg with lists produces two-level column labels such as ('Story Points', 'sum')
sprintsWithStoriesAndEpics_dfCopy = sprintsWithStoriesAndEpics_dfCopy.groupby(['Team_story']).agg({'key_story':['count'], 'Story Points':['sum'], 'name':['nunique']})

#NOTE: this expression is not the last one in the cell, so nothing is displayed for it
sprintsWithStoriesAndEpics_dfCopy.columns

#average velocity = summed story points / number of distinct sprints
sprintsWithStoriesAndEpics_dfCopy['average velocity'] = sprintsWithStoriesAndEpics_dfCopy['Story Points']['sum']/sprintsWithStoriesAndEpics_dfCopy['name']['nunique']

sprintsWithStoriesAndEpics_dfCopy

In [None]:
#Calculate the spillover stories per team

#first add up the number of sprints a story is in
sprintsWithStoriesAndEpics_dfCopy = sprintsWithStoriesAndEpics_df[['Team_story', 'key_story', 'name', 'startDate', 'Open Set To Date']].copy()

#sprintLeadTime: whole days from the story's 'Open Set To Date' to the sprint start (negative when that date is after the start)
#sprintCommitment: True when the lead time is greater than -2 days
sprintsWithStoriesAndEpics_dfCopy['sprintLeadTime'] = (sprintsWithStoriesAndEpics_dfCopy['startDate'] - sprintsWithStoriesAndEpics_dfCopy['Open Set To Date']).dt.days 
sprintsWithStoriesAndEpics_dfCopy['sprintCommitment'] = sprintsWithStoriesAndEpics_dfCopy['sprintLeadTime'] > -2

#write out the source data onto disk
#only rows whose story key occurs more than once (i.e. the story appears in several sprints) are written
sprintsWithStoriesAndEpics_dfCopy[sprintsWithStoriesAndEpics_dfCopy.duplicated(keep=False, subset='key_story')].to_excel(writer, index=False, sheet_name='Spillover Stories', freeze_panes=(1,0), columns=['Team_story', 'key_story', 'name', 'startDate', 'Open Set To Date', 'sprintLeadTime', 'sprintCommitment'])

#keep only the committed rows, ordered by story
sprintsWithStoriesAndEpics_dfCopy = sprintsWithStoriesAndEpics_dfCopy[sprintsWithStoriesAndEpics_dfCopy['sprintCommitment']].sort_values(by='key_story')

sprintsWithStoriesAndEpics_dfCopy= sprintsWithStoriesAndEpics_dfCopy.drop(columns = ['startDate', 'Open Set To Date', 'sprintCommitment', 'sprintLeadTime'])

#number of sprint rows per (team, story); the remaining column is 'name', giving the label ('name', 'count')
sprintsWithStoriesAndEpics_dfCopy = sprintsWithStoriesAndEpics_dfCopy.groupby(['Team_story', 'key_story']).agg(['count'])

#reset index since we need to do another groupby
sprintsWithStoriesAndEpics_dfCopy = sprintsWithStoriesAndEpics_dfCopy.reset_index()

sprintsWithStoriesAndEpics_dfCopy['spillover sprint count'] = sprintsWithStoriesAndEpics_dfCopy['name']['count']
sprintsWithStoriesAndEpics_dfCopy= sprintsWithStoriesAndEpics_dfCopy.drop(columns = ['name'])
#number of stories per (team, spillover sprint count)
sprintsWithStoriesAndEpics_dfCopy = sprintsWithStoriesAndEpics_dfCopy.groupby(['Team_story', 'spillover sprint count']).agg(['count'])
#NOTE(review): the result of the next line is neither assigned nor displayed, so it has no effect
sprintsWithStoriesAndEpics_dfCopy.groupby(level=0).apply(max)
sprintsWithStoriesAndEpics_dfCopy


In [None]:
#reset index since we need to do another groupby
#NOTE: this cell rewrites sprintsWithStoriesAndEpics_dfCopy in place and drops 'key_story',
#so running it a second time without re-running the previous cell will fail.
sprintsWithStoriesAndEpics_dfCopy = sprintsWithStoriesAndEpics_dfCopy.reset_index()

#copy the ('key_story', 'count') column into 'story count', then drop the original
sprintsWithStoriesAndEpics_dfCopy['story count'] = sprintsWithStoriesAndEpics_dfCopy['key_story']['count']
sprintsWithStoriesAndEpics_dfCopy= sprintsWithStoriesAndEpics_dfCopy.drop(columns = ['key_story'])

In [None]:
#Per team: average of 'spillover sprint count' weighted by 'story count',
#i.e. the mean number of sprints a committed story of that team appeared in.
sprintsWithStoriesAndEpics_dfCopy.groupby(['Team_story']).apply(lambda g: np.average(g['spillover sprint count'], weights=g['story count']))

In [None]:
#join bugs with sprints to determine how many bugs were attached to sprints and hence part of the relevant period

In [None]:
#find total number of bugs created within sprints and compare with bugs created in total within the period. Note we need 
#to focus on created bugs and not the ones which were fixed.

#also need to compare bugs found during regression with the sprint bugs

#bugs resolved but not closed

#qa and sprints

In [None]:
#combine the bugs with the stories dataframe
#find the number of bugs for each story point

#how='right' keeps every story; bugs are matched through the 'linkKey' found in their issue links
storiesWithBugs_df = pd.merge(bugs_df, stories_df, how='right', on=None, left_on='linkKey', right_on='key',
         left_index=False, right_index=False, sort=True,
         suffixes=('_bug', '_story'), copy=True, indicator=False,
         validate=None)

#dropna() removes every row with a missing value in these columns, which includes stories without a linked bug
storiesWithBugs_df = storiesWithBugs_df[['Team_story', 'key_story', 'key_bug', 'Code Review Set By', 'reporter_story',
                                         'Story Points_story']].copy().dropna()
#count the bugs per story; the only non-grouped column is 'key_bug', giving the label ('key_bug', 'count')
storiesWithBugs_df = storiesWithBugs_df.groupby(['Team_story', 'key_story', 'Code Review Set By', 'reporter_story', 'Story Points_story']).agg(['count'])
storiesWithBugs_df = storiesWithBugs_df.reset_index()
#NOTE(review): a story with 0 story points presumably produces inf here - confirm such stories cannot occur
storiesWithBugs_df['bugs per story point'] = storiesWithBugs_df['key_bug']['count']/storiesWithBugs_df['Story Points_story']

#write out the source data onto disk
storiesWithBugs_df.to_excel(writer, index=True, sheet_name='Bugs per Story Point', freeze_panes=(1,0))

#average the per-story ratio for each (team, code-review setter, reporter) combination
storiesWithBugs_df = storiesWithBugs_df.drop(columns = ['Story Points_story', 'key_story', 'key_bug'])
storiesWithBugs_df = storiesWithBugs_df.groupby(['Team_story', 'Code Review Set By', 'reporter_story']).agg(['mean'])
storiesWithBugs_df = storiesWithBugs_df.reset_index()


#copy the ('bugs per story point', 'mean') column into a single-level name, then drop the original
storiesWithBugs_df['avg bugs per story point'] = storiesWithBugs_df['bugs per story point']['mean']
storiesWithBugs_df= storiesWithBugs_df.drop(columns = ['bugs per story point'])

#highest average first
storiesWithBugs_df = storiesWithBugs_df.sort_values(by='avg bugs per story point', ascending=False)
storiesWithBugs_df

In [None]:
# Find the stories that were inserted into a sprint after it started: rows whose
# lead time (sprint start minus 'Open Set To Date', in whole days) is NOT greater than -2.

late_columns = ['Team_story', 'startDate', 'Open Set To Date', 'reporter_story', 'Story Points', 'key_story', 'name']
late_df = sprintsWithStoriesAndEpics_df[late_columns].copy()

lead_days = (late_df['startDate'] - late_df['Open Set To Date']).dt.days
late_df['sprintLeadTime'] = lead_days
late_df['sprintCommitment'] = lead_days > -2

not_committed = late_df['sprintCommitment'] != True
sprintsWithStoriesAndEpics_dfCopy = late_df[not_committed].sort_values(by='key_story')

# Write every late-commitment row to its own sheet.
sprintsWithStoriesAndEpics_dfCopy.to_excel(
    writer,
    index=False,
    sheet_name='Late Commitments to Sprint',
    freeze_panes=(1,0),
    columns=late_columns + ['sprintLeadTime', 'sprintCommitment'],
)

sprintsWithStoriesAndEpics_dfCopy

In [None]:
#number of distinct stories that appear more than once among the late commitments
#(duplicated() marks every occurrence of a story key after its first)
sprintsWithStoriesAndEpics_dfCopy[sprintsWithStoriesAndEpics_dfCopy.duplicated(subset='key_story')]['key_story'].unique().size

In [None]:
#drop unnecessary columns before we do stats
#NOTE: operates on the late-commitment rows left in sprintsWithStoriesAndEpics_dfCopy by the cell that wrote
#the 'Late Commitments to Sprint' sheet, and overwrites that variable.
sprintsWithStoriesAndEpics_dfCopy = sprintsWithStoriesAndEpics_dfCopy.drop(columns=['startDate', 'Open Set To Date', 'Story Points', 'name', 'sprintCommitment'])
#per (reporter, team): mean lead time and number of late rows
sprintsWithStoriesAndEpics_dfCopy = sprintsWithStoriesAndEpics_dfCopy.groupby(['reporter_story', 'Team_story']).agg({'sprintLeadTime':['mean'], 'key_story':['count']})

#we must filter the noise: keep only reporter/team pairs with more than 5 late rows
#sprintsWithStoriesAndEpics_dfCopy = sprintsWithStoriesAndEpics_dfCopy.reset_index()
sprintsWithStoriesAndEpics_dfCopy = sprintsWithStoriesAndEpics_dfCopy[sprintsWithStoriesAndEpics_dfCopy['key_story']['count'] > 5]
sprintsWithStoriesAndEpics_dfCopy

In [None]:
#changes to description of story after 

In [None]:
#number of bugs found post sprints are over that need to be fixed in release
#bug creation date > end 

In [None]:
# Stories that entered Testing at least 2 days before their sprint ended but reached
# Approval at least 2 days after it ended. Rows with any missing value are ignored.
# The sprint-commitment flag is computed but not used as a filter here.

testing_columns = ['Team_story', 'startDate', 'endDate', 'Testing Set To Date', 'Approval Set To Date', 'Approval Set By', 'Open Set To Date', 'key_story']

sprintsWithStoriesAndEpics_dfCopy = (
    sprintsWithStoriesAndEpics_df[testing_columns]
    .copy()
    .dropna()
    # whole days between entering Testing and the sprint end
    .assign(**{'Testing Lead Time': lambda frame: (frame['endDate'] - frame['Testing Set To Date']).dt.days})
    .loc[lambda frame: frame['Testing Lead Time'] >= 2]
    # whole days between the sprint end and entering Approval
    .assign(**{'Ready for Approval Delay': lambda frame: (frame['Approval Set To Date'] - frame['endDate']).dt.days})
    .loc[lambda frame: frame['Ready for Approval Delay'] >= 2]
    .assign(sprintLeadTime=lambda frame: (frame['startDate'] - frame['Open Set To Date']).dt.days)
    .assign(sprintCommitment=lambda frame: frame['sprintLeadTime'] > -2)
    .assign(**{'Testing Time': lambda frame: frame['Testing Lead Time'] + frame['Ready for Approval Delay']})
)

# Write the detail rows to their own sheet.
sprintsWithStoriesAndEpics_dfCopy.to_excel(
    writer,
    index=False,
    sheet_name='Testing Spillovers in Sprints',
    freeze_panes=(1,0),
    columns=testing_columns + ['Testing Lead Time', 'Ready for Approval Delay', 'Testing Time'],
)

# Keep only what the per-approver summary needs.
helper_columns = ['Ready for Approval Delay', 'startDate', 'endDate', 'Testing Set To Date', 'Approval Set To Date', 'Open Set To Date', 'sprintLeadTime', 'sprintCommitment']
sprintsWithStoriesAndEpics_dfCopy = sprintsWithStoriesAndEpics_dfCopy.sort_values(by='key_story').drop(columns=helper_columns)

sprintsWithStoriesAndEpics_dfCopy['key_story'].unique().size


In [None]:
#per (approval setter, team): number of spillover rows and the mean 'Testing Lead Time' / 'Testing Time' in days
sprintsWithStoriesAndEpics_dfCopy.groupby(['Approval Set By', 'Team_story']).agg({'key_story':['count'], 'Testing Lead Time':['mean'], 'Testing Time':['mean']})

In [None]:
# Stories that entered Approval at least 2 days before their sprint ended but were closed
# at least 2 days after it ended, restricted to rows flagged as sprint commitments.
# Rows with any missing value are ignored.

approval_columns = ['Team_story', 'startDate', 'endDate', 'Approval Set To Date', 'Approval Set By', 'Closed Set By', 'Open Set To Date', 'Closed Set To Date', 'key_story']

sprintsWithStoriesAndEpics_dfCopy = (
    sprintsWithStoriesAndEpics_df[approval_columns]
    .copy()
    .dropna()
    # whole days between entering Approval and the sprint end
    .assign(**{'Approval Lead Time': lambda frame: (frame['endDate'] - frame['Approval Set To Date']).dt.days})
    .loc[lambda frame: frame['Approval Lead Time'] >= 2]
    # whole days between the sprint end and closing
    .assign(**{'Close Delay': lambda frame: (frame['Closed Set To Date'] - frame['endDate']).dt.days})
    .loc[lambda frame: frame['Close Delay'] >= 2]
    .assign(sprintLeadTime=lambda frame: (frame['startDate'] - frame['Open Set To Date']).dt.days)
    .assign(sprintCommitment=lambda frame: frame['sprintLeadTime'] > -2)
    .loc[lambda frame: frame['sprintCommitment']]
    .sort_values(by='key_story')
    .assign(**{'Approval Time': lambda frame: frame['Approval Lead Time'] + frame['Close Delay']})
)

# Write the detail rows to their own sheet.
sprintsWithStoriesAndEpics_dfCopy.to_excel(
    writer,
    index=False,
    sheet_name='Approval Spillovers in Sprints',
    freeze_panes=(1,0),
    columns=approval_columns + ['Approval Lead Time', 'Close Delay', 'Approval Time'],
)

sprintsWithStoriesAndEpics_dfCopy


In [None]:
#per (closer, team): number of spillover rows and the mean 'Approval Lead Time' / 'Approval Time' in days
sprintsWithStoriesAndEpics_dfCopy.groupby(['Closed Set By', 'Team_story']).agg({'key_story':['count'], 'Approval Lead Time':['mean'], 'Approval Time':['mean']})

In [None]:
#'textinfo' holds the text used for the acceptance-criteria check and the classifier below.
#Only the story text is used; the variant that appends the epic text is kept commented out.
#scope_df['textinfo'] = scope_df['textinfo_story'] + scope_df['textinfo_epic']
scope_df['textinfo'] = scope_df['textinfo_story']

In [None]:
# A story counts as having acceptance criteria when its text mentions "acceptance" or the
# standalone word "AC"/"ACs" (any letter case). The word boundaries matter: a bare
# case-insensitive 'AC' also matches inside ordinary words such as "each" or "back", which
# made almost every story look valid. na=False treats stories with no text as having no
# acceptance criteria, so they are flagged too.
has_acceptance_criteria = scope_df['textinfo'].str.contains(r'acceptance|\bacs?\b', case = False, regex = True, na = False)
scope_df['Invalid AC'] = ~has_acceptance_criteria

#write the stories without acceptance criteria to their own sheet
scope_df[scope_df['Invalid AC']].to_excel(writer, index=False, sheet_name='Invalid AC', freeze_panes=(1,0), columns=['Team_story', 'key_story', 'reporter_story'])


In [None]:
#independent copy holding only the reporter and the 'Invalid AC' flag, used for the per-reporter statistics
invalid_ac_df = scope_df[['reporter_story', 'Invalid AC']].copy()

In [None]:
#produce statistics for valid/invalid AC:
#number of stories flagged 'Invalid AC' per reporter, showing the five reporters with the most
invalid_ac_df.groupby(['reporter_story']).sum().sort_values(by=['Invalid AC'], ascending=False).head()

In [None]:
# Flush all sheets to disk and release the file. close() is used instead of save()
# because ExcelWriter.save() is no longer available in current pandas releases.
writer.close()

In [None]:
# Training data for the classifier: the epic name (label) and the story text (features).
# reindex() yields an all-NaN column for a label that is missing from scope_df; rows with
# a missing value in either column are discarded.
dataset1 = scope_df.reindex(columns=['Epic Name', 'textinfo']).dropna()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

#create document vectors: one TF-IDF row per story text
#NOTE(review): the vectorizer is fitted on all rows before the train/test split, so the
#test rows influence the vocabulary and weights - confirm this is acceptable.

vectorizer = TfidfVectorizer()
vectors_ds1 = vectorizer.fit_transform(dataset1.textinfo)

In [None]:
#split this into training and test data (80% / 20%)
from sklearn.model_selection import train_test_split

predictors = vectors_ds1
targets = dataset1['Epic Name']

# A fixed seed makes the split, and therefore the accuracy reported further down,
# identical on every Restart & Run All.
pred_train, pred_test, tar_train, tar_test = train_test_split(predictors, targets, test_size=.20, random_state=42)

In [None]:
#train a multinomial naive Bayes classifier that predicts the epic name from the TF-IDF vectors
clf = MultinomialNB()
clf.fit(pred_train, tar_train)

In [None]:
#predict the epic name for the held-out test vectors
predictions = clf.predict(pred_test)

In [None]:
import sklearn.metrics

#NOTE: only the last expression of a cell is displayed, so the confusion matrix is computed
#but not shown; the cell output is the accuracy score.
sklearn.metrics.confusion_matrix(tar_test,predictions)
sklearn.metrics.accuracy_score(tar_test, predictions)

In [None]:
#inspect the size of the training matrix
#NOTE(review): pred_train is presumably a scipy sparse matrix, where .size is the number of stored entries rather than rows - confirm.
pred_train.size

In [None]:
#display the repr of the full TF-IDF matrix built from all story texts
vectors_ds1