# APO Work Items - Wrangle

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

## Assess

### Workitems

In [None]:
workitems = pd.read_csv('workitems.csv')
workitems.head()

In [None]:
workitems.info()

In [None]:
# Run created_date through pd.to_datetime, then re-check the column dtypes.
workitems['created_date'] = pd.to_datetime(workitems['created_date'])

workitems.info()

In [None]:
workitems.estimate.hist()

In [None]:
workitems[['created_date', 'estimate']].describe(include = 'all')

In [None]:
workitems[workitems.id.duplicated()]

In [None]:
workitems[workitems.estimate == 0]

In [None]:
workitems[workitems.estimate > 50]

In [None]:
workitems.last_state.value_counts()

In [None]:
workitems.workitem_type.value_counts()

In [None]:
workitems.iteration_path.value_counts().sort_index()

In [None]:
workitems[workitems.iteration_path.str.contains('Queue')]

In [None]:
workitems.assigned_to.value_counts()

In [None]:
workitems[workitems.assigned_to.isna()]

### Revisions

In [None]:
revisions = pd.read_csv('revisions.csv')

In [None]:
revisions.info()

In [None]:
revisions.describe()

In [None]:
revisions[revisions.id == 1733729]

In [None]:
revisions[revisions.id == 887472]

In [None]:
revisions[revisions.state == 'Active'][['id', 'state']].nunique()

#### Notes

**Quality**

* zero estimates
* missing assigned_to
* simplify assigned_to
* non standard workitem types
* remove rows with last_state not closed or resolved
* workitems in queue are included
* workitems assigned to people who are not part of the team
* workitems that did not pass through the Active state

**Tidiness**

* extract dates from iteration path
* remove created date in workitems and replace it with the value from revisions
* remove assigned to in workitems and replace it with the value from revisions
* add important states on workitems


## Clean

In [None]:
workitems_clean = workitems.copy()

### Quality

#### Zero Estimates

*Set Zero Estimates to np.NaN*

A zero estimate would mean no effort, which is impossible, so treat zeros as missing and set them to NaN.

**Code**

In [None]:
# Mark zero estimates as missing.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
workitems_clean.loc[workitems_clean.estimate == 0, 'estimate'] = np.nan

**Test**

In [None]:
workitems_clean.estimate.value_counts()

In [None]:
workitems_clean.info()

In [None]:
workitems_clean.estimate.hist(bins=50)

#### Non standard workitem types
*Standardize naming for work item types*

Work item type names differ between the first part of the work items and the second part. To standardize the names we need to change:
* Change Request -> Story

**Code**

In [None]:
# Rename every 'Change Request' work item type to 'Story'.
is_change_request = workitems_clean['workitem_type'] == 'Change Request'
workitems_clean.loc[is_change_request, 'workitem_type'] = 'Story'

**Test**

In [None]:
workitems_clean.workitem_type.value_counts()

#### Workitems rows with last_state not closed or resolved

*Remove rows whose `last_state` is not Closed or Resolved*

**Code**

In [None]:
# Keep only the rows whose last_state is 'Closed' or 'Resolved'.
finished_states = ['Closed', 'Resolved']
is_finished = workitems_clean['last_state'].isin(finished_states)
workitems_clean = workitems_clean[is_finished]

**Test**

In [None]:
workitems_clean.last_state.value_counts()

#### Workitems in queue are included

*There are workitems whose iteration path contains Queue or Backlog. This means they were not worked on, therefore we need to delete them*

**Code**

In [None]:
# Show the rows whose iteration path mentions Queue or Backlog.
# na=False counts a missing iteration path as "no match", so the mask is always boolean.
workitems_clean[workitems_clean['iteration_path'].str.contains('Queue|Backlog', na=False)]

In [None]:
# Drop the rows whose iteration path mentions Queue or Backlog.
# na=False counts a missing iteration path as "no match", so inverting the mask
# cannot fail on NaN and such rows are kept.
workitems_clean = workitems_clean[~workitems_clean['iteration_path'].str.contains('Queue|Backlog', na=False)]

**Test**

In [None]:
# Expected to be 0: count the rows whose iteration path still mentions Queue or Backlog.
# na=False counts a missing iteration path as "no match", so the mask is always boolean.
len(workitems_clean[workitems_clean['iteration_path'].str.contains('Queue|Backlog', na=False)])

In [None]:
workitems_clean.info()

### Tidiness

#### Remove columns retrievable from revisions

*remove the columns, `assigned_to`, `created_date`, `last_state`*

**Code**

In [None]:
# Keep only these five columns; assigned_to, created_date and last_state are dropped.
# .copy() makes the result an independent frame, so the column assignments in the
# following cells do not raise SettingWithCopyWarning.
workitems_clean = workitems_clean[['id', 'workitem_type', 'estimate', 'iteration_path', 'title']].copy()

**Test**

In [None]:
workitems_clean.info()

In [None]:
workitems_clean.head()

#### Dates in iteration path

*Extract dates in iteration path and move them to `year`, `sprint`, `start_day`, `start_month`, `end_day` and `end_month`*

**Code**

In [None]:
workitems_clean.iteration_path.value_counts()

In [None]:
# Remove the 'NCA' and 'APO' segments, plus any backslashes that follow them.
# NOTE(review): full paths presumably look like
# 'NCA\APO\<year>\Iteration <n> (<day><mon>-<day><mon>)' — confirm against the data.
# Assign through a scalar column key: a Series assigned through a one-element list key
# (df[['col']] = series) can be rejected with "Columns must be same length as key".
workitems_clean['iteration_path'] = (
    workitems_clean['iteration_path']
    .str.replace(r'NCA\\*', '', regex=True)
    .str.replace(r'APO\\*', '', regex=True)
)
workitems_clean.head()

In [None]:
# Copy the leading run of digits into a new `year` column.
# expand=False returns a Series, so it can be assigned through a scalar key.
pat = r'^(?P<year>\d+)'
workitems_clean['year'] = workitems_clean['iteration_path'].str.extract(pat, expand=False)
# Then strip those digits (and any backslashes after them) from the path.
# regex=True is required: since pandas 2.0 str.replace treats the pattern literally by default.
rep = r'^\d+\\*'
workitems_clean['iteration_path'] = workitems_clean['iteration_path'].str.replace(rep, '', regex=True)
workitems_clean[['iteration_path', 'year']]

In [None]:
# Strip a leading 'Iteration ' prefix.
# regex=True is required: since pandas 2.0 str.replace treats the pattern literally by default.
rep = r'^Iteration\s'
workitems_clean['iteration_path'] = workitems_clean['iteration_path'].str.replace(rep, '', regex=True)

In [None]:
# Strip a leading '<digit>.<digits>\Sprint ' prefix.
# regex=True is required: since pandas 2.0 str.replace treats the pattern literally by default.
rep = r'^\d\.\d+\\Sprint\s'
workitems_clean['iteration_path'] = workitems_clean['iteration_path'].str.replace(rep, '', regex=True)

In [None]:
# Blank out iteration paths that are exactly '1.12'.
# NOTE(review): presumably so the sprint pattern used afterwards (which starts with \d+)
# does not read the leading '1' as a sprint number — confirm.
workitems_clean.loc[workitems_clean.iteration_path == '1.12', 'iteration_path'] = ''

In [None]:
# Split what is left of iteration_path into five columns, one per named group:
#   sprint                  - the leading digits
#   start_day / start_month - digits, then letters, following an opening '('
#   end_day / end_month     - digits, then letters, after optional whitespace and dashes
# Everything after the sprint digits is optional, and the closing ')' is not matched.
# Groups that do not participate in a match are returned as NaN by str.extract.
pat = r'^(?P<sprint>\d+)(?:\s*\((?P<start_day>\d+)\s*(?P<start_month>[a-zA-Z]+)(?:\s*-*\s*(?P<end_day>\d+)-*(?P<end_month>[a-zA-Z]+))*)*'
workitems_clean[['sprint', 'start_day', 'start_month', 'end_day', 'end_month']] = workitems_clean.iteration_path.str.extract(pat)

**Test**

In [None]:
workitems_clean.head()

In [None]:
workitems_clean.info()

#### Title not analysis friendly

*Strip bracketed tags from the title, tokenize it, drop short words and common filler words, and store the remaining unique words as a sorted, comma-delimited list*

**Code**

In [None]:
# Remove bracketed text from the titles. The pattern is greedy, so everything from the
# first '[' to the last ']' in a title is removed.
# Raw string: '\[' and '\]' are invalid escapes in a plain string literal.
# regex=True is required: since pandas 2.0 str.replace treats the pattern literally by default.
workitems_clean['title'] = workitems_clean['title'].str.replace(r'\[.*\]', '', regex=True)

In [None]:
# Filler words that are dropped from titles. Built once as a frozenset so that
# membership tests are O(1) and the collection is not rebuilt on every call.
_TITLE_STOP_WORDS = frozenset([
    'the', 'can', 'for', 'not', 'out', 'and', 'all', 'must', 'are', 'does',
    'more', 'has', 'only', 'how', 'when', 'should', 'from',
])


def cat(words):
    """Collapse title tokens into a sorted, comma-separated string of unique words.

    Tokens of one or two characters and tokens listed in ``_TITLE_STOP_WORDS``
    are dropped; the rest are de-duplicated and sorted.

    Args:
        words: Iterable of string tokens, or a missing value (``None`` / NaN),
            which is what ``Series.str.split`` yields for a missing title.

    Returns:
        The kept words joined with ``","``; an empty string when nothing is
        kept or when ``words`` is missing.
    """
    # A missing title arrives as None or float NaN rather than a list of tokens.
    if words is None or isinstance(words, float):
        return ''
    kept = {
        word for word in words
        if len(word) > 2 and word not in _TITLE_STOP_WORDS
    }
    return ",".join(sorted(kept))
    
# Lower-case each title, split it on runs of non-word characters or underscores,
# and collapse the tokens with cat(). The title column is then dropped.
# Raw string: '\W' is an invalid escape sequence in a plain string literal.
workitems_clean['words'] = workitems_clean['title'].str.lower().str.split(r'[\W_]+').apply(cat)
workitems_clean.drop(['title'], axis = 1, inplace = True)

**Test**

In [None]:
workitems_clean.head()

In [None]:
workitems_clean.words