# Provo a predire l' età dei passeggeri in modo da non dover eliminare dei record durante il train
## Importo le librerie

In [247]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

## Load dei dati

In [248]:
df = pd.read_csv('train.csv') #Load dei dati
np.random.seed(0)

## Inizio Preprocessing dei dati

In [249]:
#sex = pd.get_dummies(df['Sex']) #codifica onehot del sesso dei passeggeri
pclass = pd.get_dummies(df['Pclass']) #codifica onehot della classe dei passeggeri
df = df.drop(df.columns[[0, 2, 3, 6, 8, 11]], axis=1) #Drop delle colonne con influenza ~ 0
f_cabin = lambda x: 1 if pd.notnull(x) else 0 #se il valore non è nullo allora metto uno 0 altrimenti, utile per la cabina
f_parch = lambda x: 1 if x != 0 else 0 #lambda per i parenti, 0 se è da solo 1 altrimenti
f_sex = lambda x: 1 if x == 'male' else 0 #applicare questa lambda funziona da paura
df['Cabin'] = pd.Series.apply(func=f_cabin, self=df['Cabin'], convert_dtype=False) #applico la lambda definita sopra
df['Parch'] = pd.Series.apply(func=f_parch, self=df['Parch'], convert_dtype=False)
df['Sex'] = pd.Series.apply(func=f_sex, self=df['Sex'], convert_dtype=False)

scaler = MinMaxScaler(feature_range=(0,100))
vector = np.array(df['Fare'], dtype=np.float32)
df['Fare'] = scaler.fit_transform(vector.reshape(-1,1))
#join delle cofifiche onehot
df = df.join(pclass)


## Load di Train e Test set

In [250]:
train = df[df['Age'].notnull()] #Il train-set è dato da tutti i dati con età non null
test = df[df['Age'].isnull()] #il prediction set è dato da tutti i dati con età null
test = test.drop(test.columns[[2]], axis=1) #rimuovo il campo età poichè è il mio y_hat

In [251]:
def agenize(data):
    for i in range(len(data)):
        if data[i] < 12:
            data[i] = 1
        elif data[i] > 12 and data[i] <= 18:
            data[i] = 2
        elif data[i] > 18 and data[i] <= 30:
            data[i] = 3
        elif data[i] > 30 and data[i] <= 45:
            data[i] = 4
        elif data[i] > 45 and data[i] <= 60:
            data[i] = 5
        else:
            data[i] = 6
            
l = list(train['Age'])
pd.options.mode.chained_assignment = None
agenize(l)
train['Age'] = l

## Load di X ed Y dal Train Set

In [253]:
from sklearn.model_selection import train_test_split
train_train, train_test = train_test_split(train, test_size=0.2)
y_train = np.array(train_train['Age'], dtype=np.int8) #carico il label di train
y_test = np.array(train_test['Age'], dtype=np.int8)
X_train = np.array(train_train.drop(train_train.columns[[2]], axis=1),  dtype=np.int8) #rimuovo il label dal mio training set
X_test = np.array(train_test.drop(train_test.columns[[2]], axis=1) ,  dtype=np.int8)

# Ora inizia il machine learning

In [254]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import AdaBoostRegressor
from sklearn.svm import SVR

In [255]:
rid = Ridge(alpha=0.01, fit_intercept=True, max_iter=20000, normalize=True, solver='auto')
las = Lasso(alpha=0.01, fit_intercept=True, max_iter=20000, normalize=True )
ada = AdaBoostRegressor(learning_rate=0.01, n_estimators=250)

In [256]:
rid = rid.fit(X_train, y_train)
las = las.fit(X_train, y_train)
ada = ada.fit(X_train, y_train)

In [257]:
print(rid.score(X_test, y_test)*100)
print(las.score(X_test, y_test)*100)
print(ada.score(X_test, y_test)*100)

27.7153014294
14.8549467244
26.1620828745


# Provo a trattare il dataset in classification

In [258]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

In [259]:
adac = AdaBoostClassifier(learning_rate=0.01, n_estimators=350)
#rfc = RandomForestClassifier(n_estimators=300, n_jobs=-1)
svm = SVC(kernel='rbf', gamma=0.1, C=1.1)
#gnb = GaussianNB()
nn = MLPClassifier(activation='relu', alpha=0.01, max_iter=3000, hidden_layer_sizes=(100,30,30))

In [260]:
print(adac.fit(X_train, y_train).score(X_test, y_test)*100)
print(svm.fit(X_train, y_train).score(X_test, y_test)*100)
print(nn.fit(X_train, y_train).score(X_test, y_test)*100)

46.1538461538
42.6573426573
47.5524475524


## Vince AdaBoost

In [261]:
predicted_age = np.array(adac.predict(test), dtype=np.int8)

In [262]:
test['Age'] = predicted_age

In [265]:
frame = [train, test]

In [267]:
result = pd.concat(frame)

In [269]:
result.reset_index()
result

Unnamed: 0,Age,Cabin,Fare,Parch,Sex,Survived,1,2,3
0,3,0,1.415106,0,1,0,0,0,1
1,4,1,13.913573,0,0,1,1,0,0
2,3,0,1.546857,0,0,1,0,0,1
3,4,1,10.364429,0,0,1,1,0,0
4,4,0,1.571255,0,1,0,0,0,1
6,5,1,10.122885,0,1,0,1,0,0
7,1,0,4.113566,1,1,0,0,0,1
8,3,0,2.173075,1,0,1,0,0,1
9,2,0,5.869429,0,0,1,0,1,0
10,1,1,3.259623,1,0,1,0,0,1


In [270]:
result.to_csv('filled_age.csv')