# Analytics

#### Date: 2020/02

#### SUMMARY:

- This notebook represents the project quality analysis of the date exposed right above. 

### TEAM:

##### Semester: 2020/02
##### Professor: Hilmer Neri

##### Members:

- Member x
- Member y

### LIBRARIES

In [155]:
# Deal with data
import pandas as pd
import numpy as np
import json
from glob import glob
import os

# Deal with API request
import urllib3
from urllib3 import request

# Deal with visualization
import seaborn as sns
import matplotlib.pyplot as plt

### GRAPH SETTINGS

In [156]:
# %config InlineBackend.figure_format ='retina'
# sns.set(font_scale=1.5)
# sns.set_style('darkgrid',
#               {'xtick.bottom' : True,
#                'ytick.left': True,
#                'grid.linestyle':'--',
#                'font.monospace': ['Computer Modern Typewriter'],
#                'axes.edgecolor' : 'white'})

### DATAFRAME SETTINGS

In [157]:
pd.set_option("display.max_rows", None, "display.max_columns", None)

### SonarCloud

##### repo names, lenguage and path to the folder with all your jsons

In [158]:
repos = ['frontend', 'gateway', 'etl_tse', 'etl_twitter', 'etl_camara', 'etl_news']
language = [['frontend', 'js'], 
            ['gateway', 'py'], 
            ['etl_tse', 'py'], 
            ['etl_twitter', 'py'], 
            ['etl_camara', 'py'], 
            ['etl_news', 'py']]

repos_lenguage = {}

for item in language:
    repos_lenguage[f"{item[0]}"] = item[1]
    

In [159]:
repos_lenguage

{'frontend': 'js',
 'gateway': 'py',
 'etl_tse': 'py',
 'etl_twitter': 'py',
 'etl_camara': 'py',
 'etl_news': 'py'}

In [160]:
jsons = glob('analytics-raw-data/*.json') # add the path here

In [161]:
jsons

['analytics-raw-data/fga-eps-mds-2020_2-etl_tse-09-05-2021(2).json',
 'analytics-raw-data/fga-eps-mds-2020_2-etl_camara-09-05-2021(2).json',
 'analytics-raw-data/fga-eps-mds-2020_2-gateway-04-05-2021.json',
 'analytics-raw-data/fga-eps-mds-2020_2-etl_twitter-02-05-2021.json',
 'analytics-raw-data/fga-eps-mds-2020_2-etl_news-18-04-2021.json',
 'analytics-raw-data/fga-eps-mds-2020_2-etl_camara-09-05-2021.json',
 'analytics-raw-data/fga-eps-mds-2020_2-frontend-09-05-2021(2).json',
 'analytics-raw-data/fga-eps-mds-2020_2-etl_news-02-05-2021.json',
 'analytics-raw-data/fga-eps-mds-2020_2-gateway-10-05-2021(1).json',
 'analytics-raw-data/fga-eps-mds-2020_2-frontend-23-03-2021.json',
 'analytics-raw-data/fga-eps-mds-2020_2-etl_news-09-05-2021(2).json',
 'analytics-raw-data/fga-eps-mds-2020_2-frontend-10-05-2021.json',
 'analytics-raw-data/fga-eps-mds-2020_2-etl_twitter-10-05-2021.json',
 'analytics-raw-data/fga-eps-mds-2020_2-etl_twitter-04-05-2021.json',
 'analytics-raw-data/fga-eps-mds-2020

In [162]:
def read_json(json_path):
    
    with open(json_path) as json_file:
        json_obj = json.load(json_file)
        
    return json_obj

def create_base_component_df(json_list):
    
    df = pd.DataFrame()

    for i in json_list:

        base_component = read_json(i)

        base_component_data = base_component['baseComponent']['measures']

        base_component_df = pd.DataFrame(base_component_data)

        base_component_df['filename'] = os.path.basename(i)

        df = df.append(base_component_df, ignore_index=True)
        
    aux_df = df['filename'].str.split(r"fga-eps-mds-2020_2-(.*?)-(.*?).json", expand=True)
    
    df['repository'] = aux_df[1]
    
    df['version'] = aux_df[2]
    
    df = df.sort_values(by=['repository', 'version'])
        
    return df, repos

#### Create base component dataframe and repos list

In [163]:
base_component_df, repos = create_base_component_df(jsons)

In [164]:
base_component_df.head(10)

Unnamed: 0,metric,value,bestValue,filename,repository,version
161,duplicated_lines_density,0.0,True,fga-eps-mds-2020_2-etl_camara-01-05-2021.json,etl_camara,01-05-2021
162,functions,30.0,,fga-eps-mds-2020_2-etl_camara-01-05-2021.json,etl_camara,01-05-2021
163,security_rating,3.0,False,fga-eps-mds-2020_2-etl_camara-01-05-2021.json,etl_camara,01-05-2021
164,files,5.0,,fga-eps-mds-2020_2-etl_camara-01-05-2021.json,etl_camara,01-05-2021
165,complexity,88.0,,fga-eps-mds-2020_2-etl_camara-01-05-2021.json,etl_camara,01-05-2021
166,ncloc,525.0,,fga-eps-mds-2020_2-etl_camara-01-05-2021.json,etl_camara,01-05-2021
167,coverage,0.0,False,fga-eps-mds-2020_2-etl_camara-01-05-2021.json,etl_camara,01-05-2021
168,reliability_rating,1.0,True,fga-eps-mds-2020_2-etl_camara-01-05-2021.json,etl_camara,01-05-2021
169,comment_lines_density,9.2,False,fga-eps-mds-2020_2-etl_camara-01-05-2021.json,etl_camara,01-05-2021
312,duplicated_lines_density,0.0,True,fga-eps-mds-2020_2-etl_camara-02-05-2021.json,etl_camara,02-05-2021


#### Create dataframe per file

In [165]:
metric_list = ['files',
               'functions',
               'complexity',
               'comment_lines_density',
               'duplicated_lines_density',
               'coverage',
               'ncloc',
               'security_rating',
               'tests',
               'test_success_density',
               'test_execution_time',
               'reliability_rating']

len(metric_list)

12

In [166]:
def metric_per_file(json):
    
    file_json = []
    
    for component in json['components']:
        if component['qualifier'] == 'FIL':
            file_json.append(component)
            
    return file_json

In [167]:
def generate_file_dataframe_per_release(metric_list, json, language_extension):
    
    df_columns = metric_list
    df = pd.DataFrame(columns = df_columns)
    
    for file in json:
        try:
            if file['language'] == language_extension:
                for measure in file['measures']:
                    df.at[file['path'], measure['metric']] = measure['value']
        except:
            pass
        
    df.reset_index(inplace = True)
    df = df.rename({'index': 'path'}, axis=1).drop(['files'], axis=1)

    return df

In [168]:
def create_file_df(json_list):
    
    df = pd.DataFrame()

    for i in json_list:

        file_component = read_json(i)
        file_component_data = metric_per_file(file_component)
        file_name = os.path.basename(i)
        file_repository =  = file_name.split(r"fga-eps-mds-2020_2-(.*?)-(.*?).json")[0]
        file_version = file_name.split(r"fga-eps-mds-2020_2-(.*?)-(.*?).json")[1]
        file_lenguage = repos_lenguage[f"{files_repository}"]
        
        file_component_df = generate_file_dataframe_per_release(metric_list, 
                                                                file_component_data,
                                                                language_extension = file_lenguage)
        file_component_df['filename'] = file_name

        df = df.append(file_component_df, ignore_index=True)
        
    # replace TeamName by yours.    
    aux_df = df['filename'].str.split(r"TeamName-(.*?)-date_(.*?).json", expand=True)
    
    df['repository'] = aux_df[1]
    
    df['version'] = aux_df[2]
    
    df = df.sort_values(by=['repository', 'version'])
        
    return df

In [169]:
file_component_df = create_file_df(jsons)

ValueError: not enough values to unpack (expected 2, got 1)

In [None]:
file_component_df.filename.unique()

In [None]:
file_component_df.head(10)

In [None]:
file_component_df.to_excel('data/data.xlsx', index = False)

In [None]:
file_component_df.repository.unique()

#### Create dataframe per repository

In [None]:
repository_dataframes = {} 

for repository in repos:
    df = file_component_df[file_component_df['repository'] == f"{repository}"]
    df.name = f"{repository}"
    repository_dataframes[f"{repository}"] = df
    
repository_dataframes.keys()

In [None]:
repository_dataframes["etl_tse"]

### Metric calculations

##### COMPLEXITY

In [None]:
def m1(df):
    
    density_non_complex_files = len(df[(df['complexity'].astype(float)/df['functions'].astype(float)) < 10])/len(df)
    
    return density_non_complex_files

##### COMMENTS

In [None]:
def m2(df):
    
    density_comment_files = len(df[(df['comment_lines_density'].astype(float) > 10) & (df['comment_lines_density'].astype(float) < 30)])/len(df)
    
    return density_comment_files

##### DUPLICATIONS

In [None]:
def m3(df):
    
    duplication = len(df[(df['duplicated_lines_density'].astype(float) < 5)])/len(df)
    
    return duplication

### Calculate m1, m2 and m3 for each repository

In [None]:
def create_metrics_df(df):
    
    version_vec = df['version'].unique()
    
    m1_list = []
    m2_list = []
    m3_list = []
    repository_list = []
    version_list = []
    
    metrics_df = pd.DataFrame()
    
    for version in version_vec:

        version_df = df[df['version'] == version]

        m1_list.append(m1(version_df))
        m2_list.append(m2(version_df))
        m3_list.append(m3(version_df))
        repository_list.append(version_df['repository'].iloc[0])
        version_list.append(version)
        
    metrics_df = pd.DataFrame({'m1': m1_list,
                               'm2': m2_list,
                               'm3': m3_list,
                               'repository': repository_list, 
                               'version': version_list})
        
    return metrics_df

In [None]:
repository_metrics = {}

for repository, repo_df in repository_dataframes.items():   
    metrics_df = create_metrics_df(repo_df)
    metrics_df.name = f"{repository}"
    repository_metrics[f"{repository}"] = metrics_df


### Data visualization

- You must do this for each of your repositories

In [None]:
for repository, metrics_df in repository_metrics.items(): 
    fig = plt.figure(figsize=(20, 10))
    plt.title(f"{repository}")
    plt.plot(metrics_df['m1'], linewidth=3, marker='o', markersize=10)

In [None]:
for repository, metrics_df in repository_metrics.items(): 
    fig = plt.figure(figsize=(20, 10))
    plt.title(f"{repository}")
    plt.plot(metrics_df['m2'], linewidth=3, marker='o', markersize=10)

In [None]:
for repository, metrics_df in repository_metrics.items(): 
    fig = plt.figure(figsize=(20, 10))
    plt.title(f"{repository}")
    plt.plot(metrics_df['m3'], linewidth=3, marker='o', markersize=10)

In [None]:
fig = plt.figure(figsize=(20, 10))

plt.plot(repo1['m1'], linewidth=3, marker='o', markersize=10)

In [None]:
fig = plt.figure(figsize=(20, 10))

plt.plot(repo1['m2'], linewidth=3, marker='o', markersize=10)

In [None]:
fig = plt.figure(figsize=(20, 10))

plt.plot(repo1['m3'], linewidth=3, marker='o', markersize=10)

In [None]:
fig = plt.figure(figsize=(20, 10))

plt.plot(repo1['m1'], linewidth=3, marker='o', markersize=10)
plt.plot(repo1['m2'], linewidth=3, marker='o', markersize=10)
plt.plot(repo1['m3'], linewidth=3, marker='o', markersize=10)

### Sub characteristic aggregation

- You must do this for each of your repositories

In [None]:
psc1 = 1
pm1 = 0.33
pm2 = 0.33
pm3 = 0.33

repo1['asc1'] = ((repo1['m1']*pm1)+(repo1['m2']*pm2)+(repo1['m3']*pm3))*psc1
repo2['asc1'] = ((repo2['m1']*pm1)+(repo2['m2']*pm2)+(repo2['m3']*pm3))*psc1
...

In [None]:
fig = plt.figure(figsize=(20, 10))

plt.plot(repo1['asc1'], linewidth=3, marker='o', markersize=10)

In [None]:
fig = plt.figure(figsize=(20, 10))

plt.plot(repo2['asc1'], linewidth=3, marker='o', markersize=10)

In [None]:
fig = plt.figure(figsize=(20, 10))


plt.plot(repo1['asc1'], linewidth=3, marker='o', markersize=5)
plt.plot(repo2['asc1'], linewidth=3, marker='o', markersize=5)
...

In [None]:
metrics_df = pd.concat([repo1_metrics, repo2_metrics, ...], ignore_index=True)

metrics_df['ac1'] = metrics_df['asc1'] * 1
metrics_df['total'] = metrics_df['asc1'] * 1

In [None]:
metrics_df

In [None]:
metrics_df.to_excel('data/metrics_df.xlsx', index = False)