## **Upload the dataset**

In [1]:
from google.colab import files
uploaded = files.upload()

Saving nexus_dataframe.csv to nexus_dataframe.csv


## **Reading the dataset. Providing initial descriptives for the project's data. Before cleaning stage.**

In [2]:
import pandas as pd

nexus_df = pd.read_csv("nexus_dataframe.csv")

print("\n******************************\n")
print("Printing Dataset Initial Shape: \n")
print(nexus_df.shape)
print("\n******************************\n")
print("Printing Dataset Initial Descriptives: \n")
print(nexus_df.describe())
print("\n******************************\n")
print("Printing Dataset Information: \n")
print(nexus_df.info())


******************************

Printing Dataset Initial Shape: 

(1268, 24)

******************************

Printing Dataset Initial Descriptives: 

          status.id  watches.watchCount  storypoints
count   1268.000000         1268.000000  1268.000000
mean     401.154574            2.365931     1.518218
std     1952.041890            1.923191     2.238412
min        1.000000            0.000000     0.000000
25%        6.000000            1.000000     0.500000
50%        6.000000            2.000000     1.000000
75%        6.000000            3.000000     2.000000
max    10734.000000           22.000000    40.000000

******************************

Printing Dataset Information: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1268 entries, 0 to 1267
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   assignee.name               951 non-null    object 
 1   components             

In [3]:
print("\n Check the number of total missing values in the Open-Source Project \n")
#To-DO
print(nexus_df.isnull())
print("\n Check the data fields that contain missing values in the Open-Source Project \n")
#To-DO
print(nexus_df.isnull().sum())
print("\n******************************\n")
print("Printing Dataset Total Number of Missing Values: \n")
print(nexus_df.isnull().sum().sum())


 Check the number of total missing values in the Open-Source Project 

      assignee.name  components  created  ...  storypoints  project  sprint
0              True       False    False  ...        False    False    True
1              True       False    False  ...        False    False    True
2              True       False    False  ...        False    False    True
3             False       False    False  ...        False    False   False
4              True       False    False  ...        False    False    True
...             ...         ...      ...  ...          ...      ...     ...
1263          False       False    False  ...        False    False    True
1264          False       False    False  ...        False    False   False
1265          False       False    False  ...        False    False   False
1266           True       False    False  ...        False    False    True
1267          False       False    False  ...        False    False    True

[1268 rows x 24

## **Calculating the number of Sprints, Issues, Developers, Reporters and Issue Statuses Before Cleaning the Data**

In [4]:
print("Number of sprints: \n")
print(nexus_df.sprint.nunique())
print(nexus_df.sprint.unique())
print('\n\n')
print(len(nexus_df.sprint.unique()))

Number of sprints: 

70
[nan 'Sprint 68 - Föhn' 'Sprint 67 - Föhn' 'Sprint 66 - Föhn'
 'Sprint 66 - Chinook' 'Sprint 65 - Chinook' 'Sprint 65 - Föhn'
 'Sprint 64 - Föhn' 'Sprint 63 - Föhn' 'Sprint 62 - Chinook'
 'Sprint 62 - Föhn' 'Sprint 61 - Föhn' 'Sprint 61 - Chinook'
 'Sprint 60 - Chinook' 'Sprint 60 - Föhn' 'Sprint 59 - Föhn' 'Sprint 54'
 'Sprint 57' 'Sprint 58 - Föhn' 'Sprint 56' 'Sprint 55' 'Sprint 53'
 'Sprint 52' 'Sprint 51' 'Sprint 50' 'Sprint 49' 'Sprint 48' 'Sprint 47'
 'Sprint 67 - Chinook' 'Sprint 46' 'Sprint 45' 'Sprint 44' 'Sprint 43'
 'Sprint 42' 'Sprint 41' 'Sprint 40' 'Sprint 39' 'Sprint 38' 'Sprint 37'
 'Sprint 36' 'Sprint 35' 'Sprint 34' 'Sprint 33' 'Sprint 32' 'Sprint 31'
 'Sprint 30' 'Sprint 29' 'Sprint 28' 'Sprint 27' 'Sprint 21' 'Sprint 26'
 'Sprint 25' 'Sprint 24' 'Sprint 23' 'Sprint 22' 'Sprint 20' 'Sprint 19'
 'Sprint 18' 'Sprint 17' 'Sprint 16' 'Sprint 15' 'Sprint 14' 'Sprint 12'
 'Sprint 9' 'Sprint 8' 'Sprint 7' 'Sprint 6' 'Sprint 5' 'Sprint 4'
 'Sprint 3'

In [5]:
print("Number of issues: \n")
print(nexus_df.key.nunique())
print(nexus_df.key.unique())
print('\n\n')
print(len(nexus_df.key.unique()))

Number of issues: 

1071
['NEXUS-10197' 'NEXUS-10191' 'NEXUS-10172' ... 'NEXUS-2911' 'NEXUS-2350'
 'NEXUS-540']



1071


In [None]:
print("Number of developers: \n")
print(nexus_df['assignee.name'].nunique())
print('\n\n')
print(nexus_df['assignee.name'].unique())
print('\n\n')
print(len(nexus_df['assignee.name'].unique()))

In [None]:
print("Number of reporters: \n")
print(nexus_df['reporter.name'].nunique())
print('\n\n')
print(nexus_df['reporter.name'].unique())
print('\n\n')
print(len(nexus_df['reporter.name'].unique()))

In [None]:
print("Number of issue status: \n")
print(nexus_df['status.name'].nunique())
print('\n\n')
print(nexus_df['status.name'].unique())
print('\n\n')
print(len(nexus_df['status.name'].unique()))

## **Exploring the dataset's columns and their missing values! *Before cleaning stage.**

In [None]:
print(f'There are {len(nexus_df.columns)} columns in the dataset.')
print("\n******************************\n")
print("Printing Dataset Columns Below: \n")
print(nexus_df.columns)

In [None]:
print('Printing some data from the dataset for ilustration purpose: \n')
nexus_df.head()

In [None]:
# Checking for missing values within the data fields before cleaning the data!

print(f"\nThere are: {nexus_df['description'].isnull().sum()} missing values in the DESCRIPTION field!\n")
print(f"\nThere are: {nexus_df['summary'].isnull().sum()} missing values in the SUMMARY field!\n")
print(f"\nThere are: {nexus_df['issuetype.name'].isnull().sum()} missing values in the ISSUETYPE.NAME field!\n")
print(f"\nThere are: {nexus_df['key'].isnull().sum()} missing values in the KEY field!\n")
print(f"\nThere are: {nexus_df['sprint'].isnull().sum()} missing values in the SPRINT field!\n")
print(f"\nThere are: {nexus_df['creator.name'].isnull().sum()} missing values in the CREATOR.NAME field!\n")
print(f"\nThere are: {nexus_df['assignee.name'].isnull().sum()} missing values in the ASIGNEE.NAME field!\n")
print(f"\nThere are: {nexus_df['reporter.name'].isnull().sum()} missing values in the REPORTER.NAME field!\n")
print(f"\nThere are: {nexus_df['created'].isnull().sum()} missing values in the CREATED field!\n")
print(f"\nThere are: {nexus_df['resolutiondate'].isnull().sum()} missing values in the RESOLUTIONDATE field!\n")

# **Cleaning the dataset**

## **1. Removing unnecessary and useless data fields!**

In [6]:
# Data cleaning steps

# Remove the unwanted columns: components, fixVersions, issuetype.subtask, versions, watches.watchCount.

nexus_df.drop(columns=['components', 'fixVersions', 'issuetype.subtask', 'versions', 'watches.watchCount', 'resolution.description', 'resolution.name'], inplace = True)

print("\n Columns: components, fixVersions, issuetype.subtask, versions, watches.watchCount, resolution.description, resolution.name have been removed from the dataset! \n")

print(f'\n There are now {len(nexus_df.columns)} columns in the dataset.\n')
print(f"\n Printing Data Fields Below: \n {nexus_df.columns} \n")


 Columns: components, fixVersions, issuetype.subtask, versions, watches.watchCount, resolution.description, resolution.name have been removed from the dataset! 


 There are now 17 columns in the dataset.


 Printing Data Fields Below: 
 Index(['assignee.name', 'created', 'creator.name', 'description',
       'issuetype.name', 'priority.name', 'reporter.name', 'resolutiondate',
       'status.id', 'status.name', 'status.statusCategory.name', 'summary',
       'updated', 'key', 'storypoints', 'project', 'sprint'],
      dtype='object') 



In [None]:
print("Printing the new dataset: \n")

nexus_df.head()

##**2. Removing the duplicate issues, only keeping one instance of them! Removing isses that have same issue_identifier (KEY)!**

In [7]:
#Data cleaning steps

# 1.  Remove duplicated issue reports: it doesn't make sense to have ISSUE REPORTS that 
#     have exactly the same content since it was probably a mistake from data input.

# Select all duplicate rows based on column: key
duplicateRowsDF = nexus_df[nexus_df.duplicated(['key'])]
print(f"\nThere are: {len(duplicateRowsDF)} issue duplicates! \n")


print(f"{nexus_df['key'].value_counts().head(50)} \n\n")
print(f"Count of column: key, before cleaning is: {nexus_df['key'].count()}\n\n")
print(f"Printing some example issues that have the same identifier: \n")
print(nexus_df.loc[nexus_df['key'] == 'XD-1507'])
print(nexus_df.loc[nexus_df['key'] == 'XD-371'])
print(nexus_df.loc[nexus_df['key'] == 'XD-375'])


There are: 197 issue duplicates! 

NEXUS-9024    5
NEXUS-6099    4
NEXUS-7769    4
NEXUS-9163    4
NEXUS-8859    4
NEXUS-7977    4
NEXUS-6045    4
NEXUS-9109    4
NEXUS-8195    3
NEXUS-7632    3
NEXUS-6448    3
NEXUS-7768    3
NEXUS-6286    3
NEXUS-6790    3
NEXUS-7614    3
NEXUS-7581    3
NEXUS-7802    3
NEXUS-7330    3
NEXUS-7765    3
NEXUS-8371    3
NEXUS-8939    3
NEXUS-6027    3
NEXUS-7629    3
NEXUS-9168    2
NEXUS-8343    2
NEXUS-9121    2
NEXUS-8199    2
NEXUS-7732    2
NEXUS-7834    2
NEXUS-9004    2
NEXUS-6767    2
NEXUS-6323    2
NEXUS-6795    2
NEXUS-8080    2
NEXUS-8039    2
NEXUS-8243    2
NEXUS-6108    2
NEXUS-9217    2
NEXUS-8535    2
NEXUS-8588    2
NEXUS-6220    2
NEXUS-6537    2
NEXUS-9870    2
NEXUS-6768    2
NEXUS-6789    2
NEXUS-8533    2
NEXUS-8449    2
NEXUS-8878    2
NEXUS-8697    2
NEXUS-8941    2
Name: key, dtype: int64 


Count of column: key, before cleaning is: 1268


Printing some example issues that have the same identifier: 

Empty DataFrame
Columns: [

In [8]:
# Continuation from previous cell

# Select all duplicate rows based on one column
#duplicateRowsDF = nexus_df[nexus_df.duplicated(['key'])]
#print(f"\n There are: {len(duplicateRowsDF)} issue duplicates! \n")

#print('\n*************************************************\n')

# Here I am dropping all duplicates, while only keeping the first instance!
nexus_df = nexus_df.drop_duplicates(subset='key', keep='first')

print(f"Count of column: key, after cleaning is: {nexus_df['key'].count()}\n\n")
print(f"{nexus_df['key'].value_counts()}\n\n")
print(nexus_df.loc[nexus_df['key'] == 'XD-1507'])
print(nexus_df.loc[nexus_df['key'] == 'XD-371'])
print(nexus_df.loc[nexus_df['key'] == 'XD-375'])


# Here I check for duplicate rows based on all columns, 
# i.e. rows where all columns have same values. 
duplicated = nexus_df[nexus_df.duplicated()]
print(f"\nFinally, there are: {len(duplicated)} duplicates")

Count of column: key, after cleaning is: 1071


NEXUS-6187     1
NEXUS-6824     1
NEXUS-7995     1
NEXUS-3945     1
NEXUS-8291     1
              ..
NEXUS-6030     1
NEXUS-4640     1
NEXUS-6091     1
NEXUS-10102    1
NEXUS-4864     1
Name: key, Length: 1071, dtype: int64


Empty DataFrame
Columns: [assignee.name, created, creator.name, description, issuetype.name, priority.name, reporter.name, resolutiondate, status.id, status.name, status.statusCategory.name, summary, updated, key, storypoints, project, sprint]
Index: []
Empty DataFrame
Columns: [assignee.name, created, creator.name, description, issuetype.name, priority.name, reporter.name, resolutiondate, status.id, status.name, status.statusCategory.name, summary, updated, key, storypoints, project, sprint]
Index: []
Empty DataFrame
Columns: [assignee.name, created, creator.name, description, issuetype.name, priority.name, reporter.name, resolutiondate, status.id, status.name, status.statusCategory.name, summary, updated, key, sto

## **3. Removing issues that do not have a creator.name recorded!**

In [9]:
#Data cleaning steps

# 2.  Remove issue reports with the empty creator field. by default, 
#     all issues have to have a creator in JIRA

print(f"Count of column: creator.name, before cleaning is: {nexus_df['creator.name'].count()}\n")
print(f"Count of missing values in the field: creator.name before cleaning, is: {nexus_df['creator.name'].isnull().sum()}. \n\n\n\n")

nexus_df.dropna(subset = ["creator.name"], inplace=True)

print(f"Count of column: creator.name, after cleaning is: {nexus_df['creator.name'].count()}\n")
print(f"Count of missing values in the field: creator.name after cleaning, is: {nexus_df['creator.name'].isnull().sum()}. \n\n")

print(nexus_df['creator.name'].value_counts())

print('\n\n')
print(nexus_df.loc[(nexus_df['creator.name'] == '')])
print('\n\n')
nexus_df.loc[(nexus_df['creator.name'].isnull())]

Count of column: creator.name, before cleaning is: 1066

Count of missing values in the field: creator.name before cleaning, is: 5. 




Count of column: creator.name, after cleaning is: 1066

Count of missing values in the field: creator.name after cleaning, is: 0. 


jtom               177
plynch             175
rseddon            154
jdillon            114
cstamas             89
                  ... 
alecharp             1
bwawok               1
tobias.oberlies      1
carlspring           1
krulls               1
Name: creator.name, Length: 95, dtype: int64



Empty DataFrame
Columns: [assignee.name, created, creator.name, description, issuetype.name, priority.name, reporter.name, resolutiondate, status.id, status.name, status.statusCategory.name, summary, updated, key, storypoints, project, sprint]
Index: []





Unnamed: 0,assignee.name,created,creator.name,description,issuetype.name,priority.name,reporter.name,resolutiondate,status.id,status.name,status.statusCategory.name,summary,updated,key,storypoints,project,sprint


## **4. Removing empty issues, i.e. issues that do not have a description and summary.**

In [10]:
#Data cleaning steps

# 3.  issue reports with empty title or body: 
#     these are probably data input errors or tests, meaningless reports. 

print(f"Count of column: key, before removing issues with no description is: {nexus_df['key'].count()}\n\n")
print(f"\n There are: {nexus_df['description'].isnull().sum()}  missing values in the DESCRIPTION field! \n")
print(f"\n There are: {nexus_df['summary'].isnull().sum()}  missing values in the SUMMARY field! \n")

#nexus_df[(nexus_df['issuetype.name'].isnull())&nexus_df['description'].isnull()&nexus_df['summary'].isnull()]
#nexus_df[(nexus_df['issuetype.name'].isnull())|nexus_df['description'].isnull()|nexus_df['summary'].isnull()].head(150)

Count of column: key, before removing issues with no description is: 1066



 There are: 29  missing values in the DESCRIPTION field! 


 There are: 0  missing values in the SUMMARY field! 



In [11]:
nexus_df.dropna(subset=['description'], inplace=True)
print(f"Count of column: key, after removing issues with no description is: {nexus_df['key'].count()}\n\n")

Count of column: key, after removing issues with no description is: 1037




In [12]:
nexus_df.dropna(subset=['description', 'summary'], inplace=True)
print(f"Count of column: key, after removing issues with no description (summary too) is: {nexus_df['key'].count()}\n\n")

Count of column: key, after removing issues with no description (summary too) is: 1037




## **5. Removing issues that do not have a ASSIGNEE.NAME recorded!**

In [None]:
#Data cleaning steps

# 2.  Remove issue reports with the empty asignee.name field.

print(f"Count of column: assignee.name, before cleaning is: {nexus_df['assignee.name'].count()}\n")
print(f"Count of missing values in the field: assignee.name before cleaning, is: {nexus_df['assignee.name'].isnull().sum()}. \n\n\n\n")

#nexus_df.dropna(subset = ["assignee.name"], inplace=True)

print(f"Count of column: assignee.name, after cleaning is: {nexus_df['assignee.name'].count()}\n")
print(f"Count of missing values in the field: assignee.name after cleaning, is: {nexus_df['assignee.name'].isnull().sum()}. \n\n")

print(nexus_df['assignee.name'].value_counts())
print('\n\n')
print(nexus_df['assignee.name'].unique())
print('\n\n')
print(nexus_df.loc[(nexus_df['assignee.name'] == '')])
print('\n\n')
nexus_df.loc[(nexus_df['assignee.name'].isnull())]

# **After Cleaning the Data**

## **Calculating the number of Sprints, Issues, Developers, Reporters and Issue Statuses After Cleaning the Data**

In [13]:
print("Number of sprints: \n")
print(nexus_df.sprint.nunique())
print(nexus_df.sprint.unique())
print('\n\n')
print(len(nexus_df.sprint.unique()))

Number of sprints: 

70
[nan 'Sprint 68 - Föhn' 'Sprint 67 - Föhn' 'Sprint 66 - Föhn'
 'Sprint 66 - Chinook' 'Sprint 65 - Chinook' 'Sprint 65 - Föhn'
 'Sprint 64 - Föhn' 'Sprint 63 - Föhn' 'Sprint 62 - Chinook'
 'Sprint 62 - Föhn' 'Sprint 61 - Föhn' 'Sprint 61 - Chinook'
 'Sprint 60 - Chinook' 'Sprint 60 - Föhn' 'Sprint 59 - Föhn' 'Sprint 54'
 'Sprint 57' 'Sprint 58 - Föhn' 'Sprint 56' 'Sprint 55' 'Sprint 53'
 'Sprint 52' 'Sprint 51' 'Sprint 50' 'Sprint 49' 'Sprint 48' 'Sprint 47'
 'Sprint 67 - Chinook' 'Sprint 46' 'Sprint 45' 'Sprint 44' 'Sprint 43'
 'Sprint 42' 'Sprint 41' 'Sprint 40' 'Sprint 39' 'Sprint 38' 'Sprint 37'
 'Sprint 36' 'Sprint 35' 'Sprint 34' 'Sprint 33' 'Sprint 32' 'Sprint 31'
 'Sprint 30' 'Sprint 29' 'Sprint 28' 'Sprint 27' 'Sprint 21' 'Sprint 26'
 'Sprint 25' 'Sprint 24' 'Sprint 23' 'Sprint 22' 'Sprint 20' 'Sprint 19'
 'Sprint 18' 'Sprint 17' 'Sprint 16' 'Sprint 15' 'Sprint 14' 'Sprint 12'
 'Sprint 9' 'Sprint 8' 'Sprint 7' 'Sprint 6' 'Sprint 5' 'Sprint 4'
 'Sprint 3'

In [14]:
print("Number of issues: \n")
print(nexus_df.key.nunique())
print(nexus_df.key.unique())
print('\n\n')
print(len(nexus_df.key.unique()))

Number of issues: 

1037
['NEXUS-10197' 'NEXUS-10191' 'NEXUS-10172' ... 'NEXUS-3201' 'NEXUS-3119'
 'NEXUS-2911']



1037


In [15]:
print("Number of developers: \n")
print(nexus_df['assignee.name'].nunique())
print('\n\n')
print(nexus_df['assignee.name'].unique())
print('\n\n')
print(len(nexus_df['assignee.name'].unique()))

Number of developers: 

22



[nan 'jtom' 'dwallace' 'fmilens' 'krobinson' 'mprescott' 'cstamas' 'alin'
 'plynch' 'simpligility' 'dsauble' 'shenty' 'msurani' 'rseddon' 'bradbeck'
 'jdillon' 'cwilper' 'scarlucci' 'bhanzelmann' 'velo' 'mcculls' 'bdemers'
 'dbradicich']



23


In [None]:
print("Number of reporters: \n")
print(nexus_df['reporter.name'].nunique())
print('\n\n')
print(nexus_df['reporter.name'].unique())
print('\n\n')
print(len(nexus_df['reporter.name'].unique()))

## **Exploring the dataset after cleaning (General Descriptives, etc.)**

In [None]:
print("\n******************************\n")
print("Printing Dataset Shape After Cleaning: \n")
print(nexus_df.shape)
print("\n******************************\n")
print("Printing Dataset Descriptives After Cleaning: \n")
print(nexus_df.describe())
print("\n******************************\n")
print("Printing Dataset Information After Cleaning: \n")
print(nexus_df.info())

## **Analyzing the cleaned data below (Missing Values)**

In [None]:
print("\nCheck the number of total missing values in the Open-Source Project \n")
print(nexus_df.isnull())
print("\n\n******************************\n")
print("\nCheck the data fields that contain missing values in the Open-Source Project \n")
print(nexus_df.isnull().sum())
print("\n******************************\n")
print("Printing Dataset Total Number of Missing Values: \n")
print(nexus_df.isnull().sum().sum())

In [None]:
# Checking for missing values after cleaning the data!

print(f"\nThere are: {nexus_df['description'].isnull().sum()} missing values in the DESCRIPTION field!\n")
print(f"\nThere are: {nexus_df['summary'].isnull().sum()} missing values in the SUMMARY field!\n")
print(f"\nThere are: {nexus_df['issuetype.name'].isnull().sum()} missing values in the ISSUETYPE.NAME field!\n")
print(f"\nThere are: {nexus_df['key'].isnull().sum()} missing values in the KEY field!\n")
print(f"\nThere are: {nexus_df['sprint'].isnull().sum()} missing values in the SPRINT field!\n")
print(f"\nThere are: {nexus_df['creator.name'].isnull().sum()} missing values in the CREATOR.NAME field!\n")
print(f"\nThere are: {nexus_df['assignee.name'].isnull().sum()} missing values in the ASIGNEE.NAME field!\n")
print(f"\nThere are: {nexus_df['reporter.name'].isnull().sum()} missing values in the REPORTER.NAME field!\n")
print(f"\nThere are: {nexus_df['created'].isnull().sum()} missing values in the CREATED field!\n")
print(f"\nThere are: {nexus_df['resolutiondate'].isnull().sum()} missing values in the RESOLUTIONDATE field!\n")

## **Checking for Scrum Rules. After cleaning stage**

In [None]:
print("\n Check the unique developers (usernames) in the Open-Source Project \n")
#Count and report the unique values

print(len(nexus_df['assignee.name'].unique()))
print('\n****************************************************\n')
nexus_df['assignee.name'].unique()

In [None]:
print("\n Check the unique reporters (Product Owners) in the Open-Source Project \n")
#Count and report the unique values

print(len(nexus_df['reporter.name'].unique()))
print('\n****************************************************\n')
nexus_df['reporter.name'].unique()

In [None]:
print("\n Check the unique creators in the Open-Source Project \n")
#Count and report the unique values

print(len(nexus_df['creator.name'].unique()))
print('\n****************************************************\n')
nexus_df['creator.name'].unique()

In [None]:
print("\n Check the total number of issues in the Open-Source Project \n")

print(nexus_df['key'].count())
#nexus_df[nexus_df.key[0]].count()
#nexus_df.count(0)

In [None]:
print("\n Check the total number of sprints in the Open-Source Project \n")

print(nexus_df['sprint'].count())
#nexus_df[nexus_df.key[0]].count()
#nexus_df.count(0)

In [None]:
print("\n******************************\n")
print("Printing Project Unique Story Points and their Count: \n")
print(nexus_df['storypoints'].value_counts())

#print("\n******************************\n")
#print("Printing the row (issue) where the number of story points is 15 \n")
#print(nexus_df.loc[nexus_df['storypoints'] == 15])

In [None]:
print("\n Exploring the data field: Created, i.e. date when the issues in this project were created. \n")
print(nexus_df['created'].head(1))
print(nexus_df['created'].value_counts())

In [None]:
print("\n Exploring the data field: Resolutiondate, i.e. date when the issues in this project were resolved. \n")
print(len(nexus_df['resolutiondate']))
print(nexus_df['resolutiondate'].tail(5))
print(nexus_df['resolutiondate'].value_counts())

print('\n\n')
nexus_df['resolutiondate'].tail(20)

In [None]:
print("\n Exploring the data field: Updated, i.e. date when the issues in this project were updated. \n")
print(nexus_df['updated'])
print(nexus_df['updated'].value_counts())

## **Working with DATES!**

In [None]:
nexus_df.info()

In [None]:
#Converting the dates to pandas datetime format, which will be easier to process.

nexus_df['created'] = pd.to_datetime(nexus_df.created)

In [None]:
nexus_df.head()

In [None]:
nexus_df.dtypes

In [None]:
nexus_df.created.dt.dayofyear 

In [None]:

nexus_df['resolutiondate'] = pd.to_datetime(nexus_df.resolutiondate)

In [None]:
nexus_df.resolutiondate.dt.dayofyear 