## ANÁLISIS DE LA BASE DE DATOS DE CS TRACK

Este script pretende entender cómo está distribuida la base de datos proporcionada por CS Track, analizar aspectos generales y familiarizarse con la BD.

### 0. Imports

In [None]:
import math
import numpy as np
import json
import pandas as pd

### 1. Load BD

In [None]:
# Location of the JSON file holding the CS Track project descriptors.
data_path = './Data/CSTrack_projects_descriptors.json'

# Parse the whole file (decoded as UTF-8) into Python objects.
with open(data_path, encoding="utf8") as json_file:
    db = json.load(json_file)

In [None]:
# Build a DataFrame from the parsed JSON content.
data = pd.DataFrame(db)
# Last expression of the cell: previews the first rows of the DataFrame.
data.head()

### 2. General Information

In [None]:
# Report the size of the dataset: rows (registers) and columns.
n_registers, n_columns = data.shape
print("Number of registers: ", n_registers)
print("Number of columns: ", n_columns)

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes).
data.info()

### 3. Insert Data_type column

Se añade una nueva columna para indicar la estructura de los datos de la página web de donde se ha extraído cada registro de la base de datos.

In [None]:
# These platform IDs were provided by CS Track.
structured_id = ['3', '5', '17', '30', '38', '39', '68', '75', '80', '84']
not_structured_id = ['1', '2', '16', '32', '33', '40', '41', '44', '45', '52', '55', '56', '57', '58', '59', '60', '61', '62', '74', '78', '79', '81', '82', '90', '94', '100', '106']
semi_structured_id = ['4', '6', '9', '10', '13', '15', '21', '27', '31', '35', '36', '37', '42', '63', '66', '76', '77', '83', '87', '88', '91', '92', '103']

# Build a single "platform ID -> label" lookup table. The groups are applied in
# the same order as the original sequence of checks, so if an ID ever appeared
# in more than one list the later group would still win.
data_type_by_id = {}
for label, ids in (
    ('structured', structured_id),
    ('not-structured', not_structured_id),
    ('semi-structured', semi_structured_id),
):
    data_type_by_id.update(dict.fromkeys(ids, label))

# Work on a copy so the raw `data` DataFrame stays untouched.
df = data.copy()

# Vectorised lookup: rows whose 'Plat Id' is not in any list get NaN.
# This replaces the row-by-row chained assignment (df[col][idx] = value),
# which triggers SettingWithCopyWarning and does not modify `df` at all
# when pandas Copy-on-Write is enabled.
df['Data_type'] = df['Plat Id'].map(data_type_by_id)

df.head()

#### 3.1. Analyze Data_type column

In [None]:
# Count each label once; value_counts() ignores rows where Data_type is NaN.
type_counts = df['Data_type'].value_counts()

# .get(..., 0) keeps the cell working even if one category has no rows
# (direct indexing would raise KeyError in that case).
semi_count = type_counts.get('semi-structured', 0)
non_count = type_counts.get('not-structured', 0)
struct_count = type_counts.get('structured', 0)

# Number of rows that received a Data_type label.
n = semi_count + non_count + struct_count

# Share of each category among the labelled rows, as a percentage.
# Guard against n == 0 so an unlabelled dataset reports 0% instead of failing.
semi_struct = (semi_count / n) * 100 if n else 0.0
non_struct = (non_count / n) * 100 if n else 0.0
struct = (struct_count / n) * 100 if n else 0.0

print("Defined Data_type =", n)
print('Not structured data = ' + str(round(non_struct, 2)) + '%')
print('Semi-structured data = ' + str(round(semi_struct, 2)) +'%')
# Fixed label: this line reports the structured share, not the not-structured one.
print('Structured data = ' + str(round(struct, 2)) + '%')

In [None]:
# Save df as a semicolon-separated, UTF-8 encoded CSV file without the index column.
df.to_csv('BD_CsTrack.csv', index = False, sep=';', encoding='utf-8')