In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR 
from sklearn.tree import DecisionTreeRegressor  
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor 

from sklearn.metrics import root_mean_squared_error, r2_score

In [None]:
# Load the development set, using the 'Id' column as the row index.
df = pd.read_csv('data/development.csv', index_col='Id')
df.head()

In [None]:
# Print the column dtypes and non-null counts of the freshly loaded frame.
df.info()

In [4]:
def _strip_brackets(raw):
    """Return `raw` with every '[' and ']' character removed."""
    return raw.replace('[', '').replace(']', '')


# 'tempo' is stored as bracketed text; strip the brackets and convert to float.
df['tempo'] = df['tempo'].map(_strip_brackets).astype('float')
# 'path' and 'sampling_rate' are not used as features.
df.drop(columns=['path', 'sampling_rate'], inplace=True)

In [None]:
# Distinct ethnicity labels and how many of them there are.
ethnicities = df['ethnicity'].unique()
ethnicities, len(ethnicities)

In [6]:
# Collapse 'ethnicity' into a single 0/1 flag ('igbo' vs. everything else),
# encode 'gender' as 1 for 'male' and 0 otherwise, then drop the raw column.
df = (
    df.assign(
        igbo=df['ethnicity'].eq('igbo').astype('int64'),
        gender=df['gender'].eq('male').astype('int64'),
    )
    .drop(columns='ethnicity')
)

In [None]:
# df = pd.get_dummies(df, columns=['ethnicity'], drop_first=True)

In [None]:
# Re-check dtypes and non-null counts after the encoding steps above.
df.info()

In [10]:
# df = df[df['num_characters'] != 281].drop(['num_words', 'num_characters'], axis=1)

In [11]:
# Target is 'age'; every remaining column is used as a feature.
y = df['age']
X = df.drop(columns='age')

# Fix the seed so the 80/20 split (and every metric computed from it) is
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [12]:
# Learn the scaling statistics on the training split only, then apply the
# same transformation to both splits.
ss = StandardScaler()
ss.fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

In [None]:
# Fit every candidate regressor with its default hyper-parameters and report
# both error metrics on the held-out split.
candidate_models = [
    LinearRegression,
    Lasso,
    Ridge,
    RandomForestRegressor,
    SVR,
    DecisionTreeRegressor,
    KNeighborsRegressor,
    MLPRegressor,
]

for model in candidate_models:
    clf = model()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    rmse = root_mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    # R2 was previously computed but never shown; report it next to the RMSE.
    print(f'{model.__name__} RMSE: {rmse} R2: {r2}')

In [24]:
# Refit a random forest for the diagnostic plots below; the seed is fixed so
# the predictions and the figures derived from them are reproducible.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
import matplotlib.pyplot as plt

# Overlay the distribution of the true targets and of the predictions.
fig, ax = plt.subplots()
for values, label in ((y_test, 'True'), (y_pred, 'Predicted')):
    ax.hist(values, bins=20, alpha=0.5, label=label)
ax.legend()

In [None]:
import numpy as np

# True target (x) against prediction (y), with one x tick per integer value
# between the smallest and largest true target.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(y_test, y_pred, alpha=0.5)
ax.grid(True, which='both', linestyle='--', linewidth=0.5)
tick_positions = np.arange(int(y_test.min()), int(y_test.max()) + 1, 1)
ax.set_xticks(tick_positions);

In [None]:
import numpy as np

# Sort the squared residuals, then take the square root: the result is the
# per-sample absolute error in ascending order.
squared_residuals = (y_test - y_pred) ** 2
err = np.sqrt(np.sort(squared_residuals))
plt.plot(err)

In [None]:
# A notebook cell only displays its last expression, so the three maxima
# (true target, prediction, absolute error) are returned as a single tuple.
y_test.max(), y_pred.max(), err.max()

In [None]:
import numpy as np

# Use the random forest fitted above (`model`). `clf` is whatever model the
# comparison loop fitted last (MLPRegressor), which exposes no `coef_`.
importances = model.feature_importances_
indices = np.argsort(importances)

plt.figure(figsize=(10, 15))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], align='center')
# Label each bar with the feature name at the matching sorted position.
plt.yticks(range(len(indices)), X.columns[indices])
plt.xlabel('Relative Importance')
plt.show()

In [None]:
# Unique values and their frequencies for the 'num_words' and
# 'num_characters' columns.
num_words_counts = df['num_words'].value_counts()
num_characters_counts = df['num_characters'].value_counts()

print("Valori unici e frequenza per la colonna num_words:", num_words_counts, sep='\n')
print("\nValori unici e frequenza per la colonna num_characters:", num_characters_counts, sep='\n')

In [None]:
# Rows whose 'num_characters' value is exactly zero.
df.loc[df['num_characters'].eq(0)]

In [None]:
# Reload both datasets from disk so the comparison starts from the raw columns.
ev = pd.read_csv('data/evaluation.csv').set_index('Id')
df = pd.read_csv('data/development.csv').set_index('Id')

# Share of rows per ethnicity in each dataset. The series are renamed so the
# two columns of the comparison table can be told apart.
ethnicity_counts_df = (df['ethnicity'].value_counts() / len(df)).rename('development')
ethnicity_counts_ev = (ev['ethnicity'].value_counts() / len(ev)).rename('evaluation')

# Keep only the ethnicities that occur in both datasets.
st = pd.concat([ethnicity_counts_df, ethnicity_counts_ev], axis=1).dropna()
st.head()

In [None]:
import matplotlib.pyplot as plt

# Columns that are not compared between the two datasets.
exclude_columns = ['ethnicity', 'sampling_rate', 'path', 'age']

# Everything else in the development frame gets a histogram.
columns_to_plot = [name for name in df.columns if name not in exclude_columns]

# Apply the same clean-up to both frames: strip the brackets around 'tempo',
# convert it to float, and drop the two columns that are not used.
for frame in (df, ev):
    frame['tempo'] = frame['tempo'].map(
        lambda raw: raw.replace('[', '').replace(']', '')
    ).astype('float')
    frame.drop(columns=['path', 'sampling_rate'], inplace=True)

# One figure per column, overlaying the development and evaluation densities.
for col in columns_to_plot:
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.hist(df[col], bins=20, alpha=0.5, label='Development', color='blue', density=True)
    ax.hist(ev[col], bins=20, alpha=0.5, label='Evaluation', color='orange', density=True)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Density')
    ax.legend()
    plt.show()

In [60]:
# Same binary encoding for both frames: an 'igbo' 0/1 flag replaces
# 'ethnicity', and 'gender' becomes 1 for 'male' and 0 otherwise.
for frame in (df, ev):
    frame['igbo'] = frame['ethnicity'].eq('igbo').astype('int64')
    frame.drop(columns='ethnicity', inplace=True)
    frame['gender'] = frame['gender'].eq('male').astype('int64')

In [61]:
# Remove the target column from the development frame.
df = df.drop(columns='age')

In [62]:
# Remember the column labels first: after scaling, `df` and `ev` hold the
# scaler's output rather than the original labelled frames.
cols = df.columns

# Fit on the development data and reuse those statistics for the evaluation data.
# NOTE(review): assumes `ev` has the same columns in the same order as `df` — confirm.
ss = StandardScaler()
ss.fit(df)
df = ss.transform(df)
ev = ss.transform(ev)

In [None]:
from sklearn.decomposition import PCA

# Run PCA with all components on the scaled development data.
pca = PCA()
X_pca = pca.fit_transform(df)

# Fraction of the variance explained by each component.
explained_variance = pca.explained_variance_ratio_
print("Explained variance ratio:", explained_variance)

# Cumulative explained variance against the component index.
cumulative_variance = np.cumsum(explained_variance)
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(cumulative_variance)
ax.set_xlabel('Number of Components')
ax.set_ylabel('Cumulative Explained Variance')
ax.set_title('Explained Variance by PCA Components')
ax.grid(True)
plt.show()

In [None]:
# Weights (loadings) of the first 5 principal components.
components = pca.components_[:5]

# One horizontal bar chart per component, one bar per original feature.
for i, component in enumerate(components):
    bar_positions = range(len(component))
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.barh(bar_positions, component, align='center')
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(cols)
    ax.set_xlabel('Peso')
    ax.set_title(f'Component {i+1}')
    plt.show()

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Fit an LDA model on the training split.
# NOTE(review): LinearDiscriminantAnalysis is a classifier, so `y_train` is
# used as class labels here — confirm this is intended for this target.
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)

# Project both splits with the fitted model.
X_train_lda = lda.transform(X_train)
X_test_lda = lda.transform(X_test)

# Show the first rows of each projection.
print("Dati di test trasformati con LDA:", X_test_lda[:5], sep='\n')
print("\nDati di addestramento trasformati con LDA:", X_train_lda[:5], sep='\n')

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Feature whose class proportions must be preserved in both splits.
stratify_col = 'igbo'

# Single stratified 80/20 split with a fixed seed.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# Split the labelled feature frame `X` and target `y` built earlier: by this
# point `df` has been replaced by the scaler's array output, so it can no
# longer be indexed by column name or with `.iloc`.
train_index, test_index = next(split.split(X, X[stratify_col]))
X_train_strat, X_test_strat = X.iloc[train_index], X.iloc[test_index]
y_train_strat, y_test_strat = y.iloc[train_index], y.iloc[test_index]

# Check that the stratification feature has the same distribution in both splits.
print(f"Distribuzione di '{stratify_col}' nel set di train:")
print(X_train_strat[stratify_col].value_counts(normalize=True))

print(f"\nDistribuzione di '{stratify_col}' nel set di test:")
print(X_test_strat[stratify_col].value_counts(normalize=True))