## **Upload the dataset**

In [None]:
# Google Colab helper for uploading local files into the runtime.
# NOTE(review): presumably used to supply the CSV read in the next section — confirm.
from google.colab import files
# The value returned by the upload call is kept in `uploaded`.
uploaded = files.upload()

## **Reading the dataset. Providing initial descriptives for the project's data. Before cleaning stage.**

In [None]:
import pandas as pd

# Load the raw issue export. The file name is hard-coded, so the file must be
# available in the working directory under exactly this name.
apstud_df = pd.read_csv("apstud_dataframe.csv")

print("\n******************************\n")
print("Printing Dataset Initial Shape: \n")
print(apstud_df.shape)
print("\n******************************\n")
print("Printing Dataset Initial Descriptives: \n")
print(apstud_df.describe())
print("\n******************************\n")
print("Printing Dataset Information: \n")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
apstud_df.info()

In [None]:
# Build the missing-value mask once and derive every summary from it.
missing_mask = apstud_df.isnull()
missing_per_column = missing_mask.sum()

print("\n Check the number of total missing values in the Open-Source Project \n")
# Cell-by-cell True/False view (True marks a missing entry).
print(missing_mask)
print("\n Check the data fields that contain missing values in the Open-Source Project \n")
# Missing entries counted per column.
print(missing_per_column)
print("\n******************************\n")
print("Printing Dataset Total Number of Missing Values: \n")
# Grand total over all columns.
print(missing_per_column.sum())

## **Calculating the number of Sprints, Issues, Developers, Reporters and Issue Statuses Before Cleaning the Data**

In [None]:
# Distinct sprints, reported two ways: nunique() leaves NaN out by default,
# while unique() keeps NaN as a value, so the two figures can differ by one.
sprint_values = apstud_df["sprint"].unique()

print("Number of sprints: \n")
print(apstud_df["sprint"].nunique())
print(sprint_values)
print('\n\n')
print(len(sprint_values))

In [None]:
# Distinct issue keys: the count via nunique(), the values themselves, and the
# length of the unique() array.
issue_keys = apstud_df["key"].unique()

print("Number of issues: \n")
print(apstud_df["key"].nunique())
print(issue_keys)
print('\n\n')
print(len(issue_keys))

In [None]:
# Developers are taken from the assignee.name column.
assignee_series = apstud_df['assignee.name']
assignee_values = assignee_series.unique()

print("Number of developers: \n")
print(assignee_series.nunique())
print('\n\n')
print(assignee_values)
print('\n\n')
# Length of the unique() array; unlike nunique(), this also counts NaN if present.
print(len(assignee_values))

In [None]:
# Reporters are taken from the reporter.name column.
reporter_series = apstud_df['reporter.name']
reporter_values = reporter_series.unique()

print("Number of reporters: \n")
print(reporter_series.nunique())
print('\n\n')
print(reporter_values)
print('\n\n')
# Length of the unique() array; unlike nunique(), this also counts NaN if present.
print(len(reporter_values))

In [None]:
# Issue statuses are taken from the status.name column.
status_series = apstud_df['status.name']
status_values = status_series.unique()

print("Number of issue status: \n")
print(status_series.nunique())
print('\n\n')
print(status_values)
print('\n\n')
# Length of the unique() array; unlike nunique(), this also counts NaN if present.
print(len(status_values))

## **Exploring the dataset's columns and their missing values! Before cleaning stage.**

In [None]:
# Column labels of the raw frame: how many there are, then the labels themselves.
column_index = apstud_df.columns
column_total = len(column_index)

print(f'There are {column_total} columns in the dataset.')
print("\n******************************\n")
print("Printing Dataset Columns Below: \n")
print(column_index)

In [None]:
# Preview of the data: head() without an argument returns the first five rows,
# and as the last expression of the cell it is rendered by the notebook.
print('Printing some data from the dataset for ilustration purpose: \n')
apstud_df.head()

In [None]:
# Checking for missing values within the data fields before cleaning the data!

# (column, label) pairs; each label is the exact text shown in the report line.
fields_to_check = [
    ('description', 'DESCRIPTION'),
    ('summary', 'SUMMARY'),
    ('issuetype.name', 'ISSUETYPE.NAME'),
    ('key', 'KEY'),
    ('sprint', 'SPRINT'),
    ('creator.name', 'CREATOR.NAME'),
    ('assignee.name', 'ASIGNEE.NAME'),
    ('reporter.name', 'REPORTER.NAME'),
    ('created', 'CREATED'),
    ('resolutiondate', 'RESOLUTIONDATE'),
]

# One report line per field, in the order listed above.
for column_name, shown_label in fields_to_check:
    missing_count = apstud_df[column_name].isnull().sum()
    print(f"\nThere are: {missing_count} missing values in the {shown_label} field!\n")

# **Cleaning the dataset**

## **1. Removing unnecessary and useless data fields!**

In [None]:
# Data cleaning steps

# Remove the unwanted columns: components, fixVersions, issuetype.subtask,
# versions, watches.watchCount, resolution.description, resolution.name.
columns_to_drop = [
    'components',
    'fixVersions',
    'issuetype.subtask',
    'versions',
    'watches.watchCount',
    'resolution.description',
    'resolution.name',
]

# Rebind the result instead of mutating in place, and ignore labels that are
# already gone, so the cell can be re-run without raising KeyError.
apstud_df = apstud_df.drop(columns=columns_to_drop, errors='ignore')

print("\n Columns: components, fixVersions, issuetype.subtask, versions, watches.watchCount, resolution.description, resolution.name have been removed from the dataset! \n")

print(f'\n There are now {len(apstud_df.columns)} columns in the dataset.\n')
print(f"\n Printing Data Fields Below: \n {apstud_df.columns} \n")

In [None]:
print("Printing the new dataset: \n")

# Last expression of the cell, so the notebook renders the first 30 rows.
apstud_df.head(30)

## **2. Removing the duplicate issues, only keeping one instance of them! Removing issues that have the same issue_identifier (KEY)!**

In [None]:
#Data cleaning steps

# 1.  Report duplicated issue reports: rows that share the same issue
#     identifier (key) are treated as duplicates, probably a data-input mistake.

# duplicated(['key']) flags the 2nd, 3rd, ... occurrence of each key (the first
# occurrence is not flagged), so the length below is the number of surplus rows.
duplicateRowsDF = apstud_df[apstud_df.duplicated(['key'])]
print(f"\nThere are: {len(duplicateRowsDF)} issue duplicates! \n")


print(f"{apstud_df['key'].value_counts()} \n\n")
print(f"Count of column: key, before cleaning is: {apstud_df['key'].count()}\n\n")
print("Printing some example issues that have the same identifier: \n")

# Show the examples announced above. keep=False flags every occurrence of a
# repeated key, and sorting by key places rows of the same issue side by side.
apstud_df[apstud_df.duplicated(['key'], keep=False)].sort_values('key').head(10)

In [None]:
# Continuation from previous cell

# Keep only the first row for every issue key; later rows with the same key
# are discarded.
apstud_df = apstud_df.drop_duplicates(subset='key', keep='first')

key_column = apstud_df['key']
print(f"Count of column: key, after cleaning is: {key_column.count()}\n\n")
print(f"{key_column.value_counts()}\n\n")


# Final check: rows that are identical across all columns.
fully_duplicated_mask = apstud_df.duplicated()
duplicated = apstud_df[fully_duplicated_mask]
print(f"\nFinally, there are: {len(duplicated)} duplicates")

## **3. Removing issues that do not have a creator.name recorded!**

In [None]:
#Data cleaning steps

# Remove issue reports whose creator.name is missing (NaN). Original author's
# note: by default, all issues have to have a creator in JIRA.

print(f"Count of column: creator.name, before cleaning is: {apstud_df['creator.name'].count()}\n")
print(f"Count of missing values in the field: creator.name before cleaning, is: {apstud_df['creator.name'].isnull().sum()}. \n\n\n\n")

# Rebind the filtered frame instead of using inplace=True: apstud_df is itself
# the result of an earlier filtering step, and mutating such a derived frame in
# place can emit SettingWithCopyWarning on pandas versions without
# copy-on-write. inplace brings no benefit here.
apstud_df = apstud_df.dropna(subset=["creator.name"])

print(f"Count of column: creator.name, after cleaning is: {apstud_df['creator.name'].count()}\n")
print(f"Count of missing values in the field: creator.name after cleaning, is: {apstud_df['creator.name'].isnull().sum()}. \n\n")

print(apstud_df['creator.name'].value_counts())

print('\n\n')
# Rows whose creator is an empty string (these are only shown, not removed).
print(apstud_df.loc[(apstud_df['creator.name'] == '')])
print('\n\n')
# Rows still missing a creator; expected to be empty after the dropna above.
apstud_df.loc[(apstud_df['creator.name'].isnull())]

## **4. Removing empty issues, i.e. issues that are missing a description or a summary.**

In [None]:
#Data cleaning steps

# Issue reports with an empty title (summary) or body (description) are
# considered data input errors or tests. This cell only reports how many exist.
issue_count = apstud_df['key'].count()
missing_descriptions = apstud_df['description'].isnull().sum()
missing_summaries = apstud_df['summary'].isnull().sum()

print(f"Count of column: key, before removing issues with no description is: {issue_count}\n\n")
print(f"\n There are: {missing_descriptions}  missing values in the DESCRIPTION field! \n")
print(f"\n There are: {missing_summaries}  missing values in the SUMMARY field! \n")

In [None]:
# Drop issues that have no description. The result is rebound rather than
# produced with inplace=True: apstud_df comes from earlier filtering steps, and
# in-place mutation of such a frame can emit SettingWithCopyWarning on pandas
# versions without copy-on-write.
apstud_df = apstud_df.dropna(subset=['description'])
print(f"Count of column: key, after removing issues with no description is: {apstud_df['key'].count()}\n\n")

In [None]:
# Drop issues missing a description or a summary (dropna removes a row when
# any column in `subset` is NaN). Rebound rather than mutated in place, since
# in-place edits of a previously filtered frame can emit
# SettingWithCopyWarning on pandas versions without copy-on-write.
apstud_df = apstud_df.dropna(subset=['description', 'summary'])
print(f"Count of column: key, after removing issues with no description (summary too) is: {apstud_df['key'].count()}\n\n")

## **5. Removing issues that do not have an ASSIGNEE.NAME recorded!**

In [None]:
#Data cleaning steps

# Report on issue reports with an empty assignee.name field.
# NOTE(review): the dropna call below is commented out, so no rows are removed
# in this cell and the "before" and "after" figures it prints are the same —
# confirm whether issues without an assignee are meant to be kept.

assigned_before = apstud_df['assignee.name'].count()
unassigned_before = apstud_df['assignee.name'].isnull().sum()
print(f"Count of column: assignee.name, before cleaning is: {assigned_before}\n")
print(f"Count of missing values in the field: assignee.name before cleaning, is: {unassigned_before}. \n\n\n\n")

#apstud_df.dropna(subset = ["assignee.name"], inplace=True)

# Values are read again here so they stay correct if the line above is enabled.
assigned_after = apstud_df['assignee.name'].count()
unassigned_after = apstud_df['assignee.name'].isnull().sum()
print(f"Count of column: assignee.name, after cleaning is: {assigned_after}\n")
print(f"Count of missing values in the field: assignee.name after cleaning, is: {unassigned_after}. \n\n")

assignee_series = apstud_df['assignee.name']
print(assignee_series.value_counts())
print('\n\n')
print(assignee_series.unique())
print('\n\n')
# Rows whose assignee is an empty string.
print(apstud_df.loc[assignee_series == ''])
print('\n\n')
# Rows whose assignee is missing (NaN); rendered as the cell output.
apstud_df.loc[assignee_series.isnull()]

# **After Cleaning the Data**

## **Calculating the number of Sprints, Issues, Developers, Reporters and Issue Statuses After Cleaning the Data**

In [None]:
# Distinct sprints after cleaning: nunique() leaves NaN out by default, while
# unique() keeps NaN as a value, so the two figures can differ by one.
sprints_after_cleaning = apstud_df["sprint"].unique()

print("Number of sprints: \n")
print(apstud_df["sprint"].nunique())
print(sprints_after_cleaning)
print('\n\n')
print(len(sprints_after_cleaning))

In [None]:
# Distinct issue keys after cleaning: count, values, and array length.
keys_after_cleaning = apstud_df["key"].unique()

print("Number of issues: \n")
print(apstud_df["key"].nunique())
print(keys_after_cleaning)
print('\n\n')
print(len(keys_after_cleaning))

In [None]:
# Developers (assignee.name) after cleaning.
assignees_after_cleaning = apstud_df['assignee.name']
assignee_names = assignees_after_cleaning.unique()

print("Number of developers: \n")
print(assignees_after_cleaning.nunique())
print('\n\n')
print(assignee_names)
print('\n\n')
# Length of the unique() array; unlike nunique(), this also counts NaN if present.
print(len(assignee_names))

In [None]:
# Reporters (reporter.name) after cleaning.
reporters_after_cleaning = apstud_df['reporter.name']
reporter_names = reporters_after_cleaning.unique()

print("Number of reporters: \n")
print(reporters_after_cleaning.nunique())
print('\n\n')
print(reporter_names)
print('\n\n')
# Length of the unique() array; unlike nunique(), this also counts NaN if present.
print(len(reporter_names))

## **Exploring the dataset after cleaning (General Descriptives, etc.)**

In [None]:
# Shape, numeric descriptives and column information of the cleaned frame.
print("\n******************************\n")
print("Printing Dataset Shape After Cleaning: \n")
print(apstud_df.shape)
print("\n******************************\n")
print("Printing Dataset Descriptives After Cleaning: \n")
print(apstud_df.describe())
print("\n******************************\n")
print("Printing Dataset Information After Cleaning: \n")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
apstud_df.info()

## **Analyzing the cleaned data below (Missing Values)**

In [None]:
# Build the missing-value mask of the cleaned frame once and reuse it.
null_mask_after = apstud_df.isnull()
nulls_by_column_after = null_mask_after.sum()

print("\nCheck the number of total missing values in the Open-Source Project \n")
# Cell-by-cell True/False view (True marks a missing entry).
print(null_mask_after)
print("\n\n******************************\n")
print("\nCheck the data fields that contain missing values in the Open-Source Project \n")
# Missing entries counted per column.
print(nulls_by_column_after)
print("\n******************************\n")
print("Printing Dataset Total Number of Missing Values: \n")
# Grand total over all columns.
print(nulls_by_column_after.sum())

In [None]:
# Checking for missing values after cleaning the data!

# (column, label) pairs; each label is the exact text shown in the report line.
fields_after_cleaning = (
    ('description', 'DESCRIPTION'),
    ('summary', 'SUMMARY'),
    ('issuetype.name', 'ISSUETYPE.NAME'),
    ('key', 'KEY'),
    ('sprint', 'SPRINT'),
    ('creator.name', 'CREATOR.NAME'),
    ('assignee.name', 'ASIGNEE.NAME'),
    ('reporter.name', 'REPORTER.NAME'),
    ('created', 'CREATED'),
    ('resolutiondate', 'RESOLUTIONDATE'),
)

# One report line per field, in the order listed above.
for field_name, field_label in fields_after_cleaning:
    remaining_missing = apstud_df[field_name].isnull().sum()
    print(f"\nThere are: {remaining_missing} missing values in the {field_label} field!\n")

## **Checking for Scrum Rules. After cleaning stage**

In [None]:
print("\n Check the unique developers (usernames) in the Open-Source Project \n")
#Count and report the unique values

# A missing assignee (NaN) is not a username. unique() keeps NaN as a value, so
# it is dropped first; otherwise the developer count is inflated by one
# whenever any issue has no assignee.
developer_names = apstud_df['assignee.name'].dropna().unique()

print(len(developer_names))
print('\n****************************************************\n')
developer_names

In [None]:
print("\n Check the unique reporters (Product Owners) in the Open-Source Project \n")
#Count and report the unique values

# A missing reporter (NaN) is not a person. unique() keeps NaN as a value, so
# it is dropped first; otherwise the reporter count is inflated by one
# whenever any issue has no reporter.
reporter_usernames = apstud_df['reporter.name'].dropna().unique()

print(len(reporter_usernames))
print('\n****************************************************\n')
reporter_usernames

In [None]:
print("\n Check the unique creators in the Open-Source Project \n")
#Count and report the unique values

# Compute the array of distinct creator names once; it is used for the count
# and then rendered as the cell output.
creator_names = apstud_df['creator.name'].unique()

print(len(creator_names))
print('\n****************************************************\n')
creator_names

In [None]:
print("\n Check the total number of issues in the Open-Source Project \n")

# count() returns the number of non-missing entries in the key column.
issue_total = apstud_df['key'].count()
print(issue_total)

In [None]:
print("\n Check the total number of sprints in the Open-Source Project \n")

# nunique() counts the distinct sprint values (NaN excluded). count() was used
# here before, but it returns the number of rows that have a sprint value,
# i.e. a number of issues, not a number of sprints.
print(apstud_df['sprint'].nunique())

In [None]:
# How often each story-point value occurs (value_counts() leaves NaN out by default).
story_point_counts = apstud_df['storypoints'].value_counts()

print("\n******************************\n")
print("Printing Project Unique Story Points and their Count: \n")
print(story_point_counts)

In [None]:
# First entry of the 'created' column and the frequency of each creation value.
created_column = apstud_df['created']

print("\n Exploring the data field: Created, i.e. date when the issues in this project were created. \n")
print(created_column.head(1))
print(created_column.value_counts())

In [None]:
# Length, last entries and value frequencies of the 'resolutiondate' column.
resolution_column = apstud_df['resolutiondate']

print("\n Exploring the data field: Resolutiondate, i.e. date when the issues in this project were resolved. \n")
print(len(resolution_column))
print(resolution_column.tail(5))
print(resolution_column.value_counts())

print('\n\n')
# Last expression of the cell: the final 20 entries are rendered as output.
resolution_column.tail(20)

In [None]:
# The 'updated' column itself, followed by the frequency of each value.
updated_column = apstud_df['updated']

print("\n Exploring the data field: Updated, i.e. date when the issues in this project were updated. \n")
print(updated_column)
print(updated_column.value_counts())

## **Working with DATES!**

In [None]:
# Column dtypes and non-null counts, inspected before the date columns are converted.
apstud_df.info()

In [None]:
#Converting the dates to pandas datetime format, which will be easier to process.

# Overwrites the 'created' column with the parsed result of pd.to_datetime.
# NOTE(review): with default arguments, values that cannot be parsed raise an
# error; the timestamp format and time-zone handling are not visible here — confirm.
apstud_df['created'] = pd.to_datetime(apstud_df.created)

In [None]:
# First five rows, shown after the 'created' column has been converted.
apstud_df.head()

In [None]:
# Column dtypes; 'created' should now be reported as a datetime type.
apstud_df.dtypes

In [None]:
# Ordinal day of the year on which each issue was created (needs the datetime
# dtype, hence the conversion of 'created' beforehand).
apstud_df.created.dt.dayofyear 

In [None]:

# Overwrites the 'resolutiondate' column with the parsed result of pd.to_datetime.
# NOTE(review): issues without a resolution date presumably become NaT — confirm.
apstud_df['resolutiondate'] = pd.to_datetime(apstud_df.resolutiondate)

In [None]:
# Ordinal day of the year on which each issue was resolved.
# NOTE(review): rows without a resolution date presumably yield NaN here — confirm.
apstud_df.resolutiondate.dt.dayofyear 