In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

In [2]:
# Raw string: the Windows backslashes are taken literally instead of being
# parsed as (invalid) escape sequences such as "\D" or "\m", which newer Python
# versions warn about. The resulting path is unchanged.
houses = pd.read_csv(r'OneDrive\Documents\my_datasets\AmesHousing.tsv', delimiter="\t")
houses.shape

(2930, 82)

In [3]:
# Preview the first rows of the freshly loaded frame.
houses.head()

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


In [4]:
def transform_features(df):
    """Baseline transformation step: hand the frame back untouched."""
    return df

def select_features(df):
    """Baseline feature selection: keep one predictor plus the target column."""
    kept_columns = ["Gr Liv Area", "SalePrice"]
    return df[kept_columns]

def train_and_test(df):
    """Fit a linear regression on the first 1460 rows and score it on the rest.

    Every float/integer column of ``df`` except ``SalePrice`` is used as a
    predictor.

    Returns the root mean squared error of the predictions on the hold-out rows.
    """
    target = 'SalePrice'
    predictors = df.select_dtypes(include=['float', 'integer']).columns.drop(target)

    # Fixed, unshuffled split: first 1460 rows for fitting, remainder for scoring.
    train, test = df[:1460], df[1460:]

    model = LinearRegression()
    model.fit(train[predictors], train[target])
    test_predictions = model.predict(test[predictors])

    return np.sqrt(mean_squared_error(test[target], test_predictions))

In [5]:
# Run the pipeline end to end: transform -> select features -> evaluate.
transform_df = transform_features(houses)
filtered_df = select_features(transform_df)
rmse = train_and_test(filtered_df)

# Show the resulting RMSE as the cell output.
rmse

57088.25161263909

## Feature Engineering

 -    Gérez les valeurs manquantes :
      -   Toutes les colonnes :
          -   Supprimez toute colonne ayant 5 % ou plus de valeurs manquantes.
      -   Colonnes de texte :
          -   Supprimez toute colonne ayant une ou plusieurs valeurs manquantes.
      -   Colonnes numériques :
          -   Pour les colonnes ayant des valeurs manquantes, remplacez celles-ci par la valeur la plus fréquente de la colonne.

1: Toutes les colonnes: supprimez celles avec 5% ou plus de valeurs manquantes.

In [6]:
# Count the missing values per column, then drop (in place) every column in
# which at least one row in twenty is missing.
cols_with_na = houses.isna().sum()
cols_2drop = cols_with_na.loc[cols_with_na >= len(houses) / 20]
houses.drop(columns=cols_2drop.index, inplace=True)

2: Colonnes de texte: supprimez celles avec une ou plusieurs valeurs manquantes.

In [7]:
# Text (object-dtype) columns are dropped in place as soon as they contain a
# single missing value.
col_text_mv = houses.select_dtypes(include=['object']).isna().sum()
col_text_2drop = col_text_mv.loc[col_text_mv > 0]
houses.drop(columns=col_text_2drop.index, inplace=True)

3: Colonnes numériques: Pour les colonnes avec des valeurs manquantes, remplir avec la valeur la plus courante dans cette colonne

In [8]:
# Missing-value count per numeric column, restricted to the columns that
# actually contain gaps (only those need a replacement value).
# 'number' selects every numeric dtype; the 'int' alias is avoided because it
# is not a reliable way to pick up all integer widths.
num_missing = houses.select_dtypes(include='number').isna().sum()
num_missing = num_missing[num_missing > 0]
num_missing

Mas Vnr Area      23
BsmtFin SF 1       1
BsmtFin SF 2       1
Bsmt Unf SF        1
Total Bsmt SF      1
Bsmt Full Bath     2
Bsmt Half Bath     2
Garage Cars        1
Garage Area        1
dtype: int64

In [9]:
# Most frequent value of each selected column: the first row of .mode() holds
# one mode per column and is turned here into a {column: value} dict.
replacement_values_dict = (
    houses[num_missing.index]
    .mode()
    .to_dict(orient='records')[0]
)
replacement_values_dict

{'Mas Vnr Area': 0.0,
 'BsmtFin SF 1': 0.0,
 'BsmtFin SF 2': 0.0,
 'Bsmt Unf SF': 0.0,
 'Total Bsmt SF': 0.0,
 'Bsmt Full Bath': 0.0,
 'Bsmt Half Bath': 0.0,
 'Garage Cars': 2.0,
 'Garage Area': 0.0}

In [10]:
# Fill each gap with its column's replacement value, then re-count the missing
# values per column.
houses = houses.fillna(value=replacement_values_dict)
houses.isna().sum()

Order             0
PID               0
MS SubClass       0
MS Zoning         0
Lot Area          0
                 ..
Mo Sold           0
Yr Sold           0
Sale Type         0
Sale Condition    0
SalePrice         0
Length: 64, dtype: int64

In [11]:
# Tally the per-column missing counts (how many columns have 0, 1, ... gaps).
houses.isna().sum().value_counts()

0    64
dtype: int64

In [12]:
# Sale year minus build year; list the rows where the difference is negative.
years_sold = houses['Yr Sold'].sub(houses['Year Built'])
years_sold.loc[years_sold.lt(0)]

2180   -1
dtype: int64

In [13]:
# Sale year minus remodel/addition year; list the rows where it is negative.
years_since_remod = houses['Yr Sold'].sub(houses['Year Remod/Add'])
years_since_remod.loc[years_since_remod.lt(0)]

1702   -1
2180   -2
2181   -1
dtype: int64

Quelles nouvelles variables pouvons-nous créer pour mieux capturer l'information contenue dans certaines variables existantes ?

In [14]:
houses['Years Before Sale'] = years_sold
houses['Years Since Remod'] = years_since_remod

# Drop the rows where the sale year precedes the build or remodel year. The
# rows are found from the data itself rather than from hard-coded row labels,
# so the cell keeps working if the input file changes.
has_negative_age = (houses['Years Before Sale'] < 0) | (houses['Years Since Remod'] < 0)
houses = houses.drop(index=houses.index[has_negative_age])

# The two year columns are superseded by the derived features above.
houses = houses.drop(columns=["Year Built", "Year Remod/Add"])

Supprimez les colonnes qui:

  -   ne sont pas utiles pour le ML
  -   entraînent une fuite de données sur la vente finale ; on peut en savoir plus sur les colonnes [ici](http://jse.amstat.org/v19n3/decock/DataDocumentation.txt).

In [15]:
# First the two identifier-like columns, then the sale-related columns.
houses = houses.drop(columns=["PID", "Order"])
houses = houses.drop(columns=["Mo Sold", "Sale Condition", "Sale Type", "Yr Sold"])

## Mettons à jour transform_features ()

In [16]:
def transform_features(df, missing_threshold=0.05):
    """Clean the raw housing frame and derive the two age features.

    Steps, in order:
      1. drop every column whose share of missing values is at least
         ``missing_threshold``;
      2. drop every text (object) column that still has a missing value;
      3. fill the gaps of the remaining numeric columns with the column mode;
      4. add 'Years Before Sale' and 'Years Since Remod' and drop the rows
         where either of them is negative;
      5. drop the year, identifier and sale-related columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. After steps 1-2 it must still contain 'Yr Sold',
        'Year Built', 'Year Remod/Add', 'PID', 'Order', 'Mo Sold',
        'Sale Condition' and 'Sale Type'. The frame is not modified.
    missing_threshold : float, default 0.05
        Share of missing values from which a column is dropped.

    Returns
    -------
    pandas.DataFrame
        A new, transformed frame.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    """
    # Every step builds a new frame (no inplace=True), so the caller's
    # DataFrame is left untouched.
    missing_share = df.isna().mean()
    df = df.drop(columns=missing_share[missing_share >= missing_threshold].index)

    text_missing = df.select_dtypes(include=['object']).isna().sum()
    df = df.drop(columns=text_missing[text_missing > 0].index)

    # Only numeric columns that actually have gaps need a replacement value.
    numeric_missing = df.select_dtypes(include='number').isna().sum()
    columns_to_fill = numeric_missing[numeric_missing > 0].index
    if len(columns_to_fill) > 0:
        modes = df[columns_to_fill].mode()
        if not modes.empty:
            # First row of .mode() holds one most-frequent value per column.
            df = df.fillna(modes.iloc[0].to_dict())

    years_before_sale = df['Yr Sold'] - df['Year Built']
    years_since_remod = df['Yr Sold'] - df['Year Remod/Add']
    df = df.assign(**{
        'Years Before Sale': years_before_sale,
        'Years Since Remod': years_since_remod,
    })

    # Rows sold "before" being built/remodelled are found from the data rather
    # than from fixed row labels, so any input file is handled.
    inconsistent = (years_before_sale < 0) | (years_since_remod < 0)
    df = df.loc[~inconsistent]

    unused_columns = ["Year Built", "Year Remod/Add",
                      "PID", "Order", "Mo Sold", "Sale Condition", "Sale Type", "Yr Sold"]
    return df.drop(columns=unused_columns)

In [17]:
# Reload the raw file and run the pipeline again with the updated
# transform_features. The path is a raw string so its backslashes are not
# parsed as (invalid) escape sequences; the resulting path is unchanged.
houses = pd.read_csv(r'OneDrive\Documents\my_datasets\AmesHousing.tsv', delimiter="\t")
transform_df = transform_features(houses)
filtered_df = select_features(transform_df)
rmse = train_and_test(filtered_df)
rmse


55275.367312413066

## Feature Selection


In [18]:
# Integer/float view of the transformed data.
num_df = transform_df.select_dtypes(include=['integer', 'float'])

In [19]:
# Absolute correlation of every numeric column with SalePrice, weakest first,
# then keep only the columns strictly above 0.4.
corr_coeffs = num_df.corr()['SalePrice'].abs().sort_values()
corr_coeffs = corr_coeffs.loc[corr_coeffs.gt(0.4)]
corr_coeffs

BsmtFin SF 1         0.439284
Fireplaces           0.474831
TotRms AbvGrd        0.498574
Mas Vnr Area         0.506983
Years Since Remod    0.534985
Full Bath            0.546118
Years Before Sale    0.558979
1st Flr SF           0.635185
Garage Area          0.641425
Total Bsmt SF        0.644012
Garage Cars          0.648361
Gr Liv Area          0.717596
Overall Qual         0.801206
SalePrice            1.000000
Name: SalePrice, dtype: float64

In [20]:
# corr_coeffs only holds the columns whose correlation is above 0.4, so
# filtering it with "< 0.4" would select nothing and drop no column. Drop
# instead every numeric column that did not make it into corr_coeffs.
weak_numeric_cols = (
    transform_df.select_dtypes(include=['integer', 'float'])
    .columns.difference(corr_coeffs.index)
)
transform_df = transform_df.drop(columns=weak_numeric_cols)

Quelles colonnes catégorielles devons-nous conserver?

In [21]:
# Names of the columns treated as nominal (categorical) features.
nominal_features = [
    "PID", "MS SubClass", "MS Zoning", "Street", "Alley", "Land Contour",
    "Lot Config", "Neighborhood", "Condition 1", "Condition 2", "Bldg Type",
    "House Style", "Roof Style", "Roof Matl", "Exterior 1st", "Exterior 2nd",
    "Mas Vnr Type", "Foundation", "Heating", "Central Air", "Garage Type",
    "Misc Feature", "Sale Type", "Sale Condition",
]

In [22]:
# Nominal columns that are still present in the transformed frame.
transform_category_cols = [col for col in nominal_features if col in transform_df.columns]

# Number of distinct (non-missing) values per nominal column, ascending.
unique_counts = transform_df[transform_category_cols].nunique().sort_values()

# Drop the nominal columns that have more than 10 distinct values.
unique_cols_2drop = unique_counts.loc[unique_counts > 10].index
transform_df = transform_df.drop(columns=unique_cols_2drop)

In [23]:
# Remaining text columns, to be replaced by one indicator column per category.
text_cols = transform_df.select_dtypes(include=['object'])
if len(text_cols.columns) > 0:
    # Encode all text columns in a single call rather than once per loop
    # iteration; skipping the step when there is nothing to encode avoids
    # using an undefined `dummy`.
    # dtype=int keeps the indicators numeric: the pandas >= 2.0 default is bool,
    # which an integer/float dtype filter would silently leave out.
    dummy = pd.get_dummies(text_cols.astype('category'), dtype=int)
    transform_df = pd.concat(
        [transform_df.drop(columns=text_cols.columns), dummy], axis=1
    )

## Mettons à jour select_features ()

In [24]:
def select_features(df, corr_threshold=0.4, max_categories=10):
    """Select the model features from a transformed housing frame.

    Steps, in order:
      1. drop the integer/float columns whose absolute correlation with
         'SalePrice' is below ``corr_threshold``;
      2. drop the nominal columns that have more than ``max_categories``
         distinct values;
      3. replace every remaining text (object) column by integer indicator
         columns, appended after the other columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric 'SalePrice' column. The frame is not modified.
    corr_threshold : float, default 0.4
    max_categories : int, default 10

    Returns
    -------
    pandas.DataFrame
        A new frame restricted to the selected / encoded columns.
    """
    numeric_part = df.select_dtypes(include=['integer', 'float'])
    target_corr = numeric_part.corr()['SalePrice'].abs()
    df = df.drop(columns=target_corr[target_corr < corr_threshold].index)

    nominal_features = ["PID", "MS SubClass", "MS Zoning", "Street", "Alley", "Land Contour", "Lot Config", "Neighborhood",
                        "Condition 1", "Condition 2", "Bldg Type", "House Style", "Roof Style", "Roof Matl", "Exterior 1st",
                        "Exterior 2nd", "Mas Vnr Type", "Foundation", "Heating", "Central Air", "Garage Type",
                        "Misc Feature", "Sale Type", "Sale Condition"]
    present_nominal = [col for col in nominal_features if col in df.columns]

    # Count categories on the local frame: reading a notebook-level variable
    # here would make the result depend on whatever that variable holds.
    category_counts = df[present_nominal].nunique()
    df = df.drop(columns=category_counts[category_counts > max_categories].index)

    text_cols = df.select_dtypes(include=['object']).columns
    if len(text_cols) == 0:
        return df

    # dtype=int keeps the indicators numeric; the pandas >= 2.0 default is bool,
    # which an integer/float dtype filter would silently leave out.
    dummies = pd.get_dummies(df[text_cols].astype('category'), dtype=int)
    return pd.concat([df.drop(columns=text_cols), dummies], axis=1)

## Mettons à jour train_and_test ()

In [25]:
def _fit_and_score(model, train, test, features, target="SalePrice"):
    """Fit ``model`` on ``train`` and return its RMSE on ``test``."""
    model.fit(train[features], train[target])
    predictions = model.predict(test[features])
    return np.sqrt(mean_squared_error(test[target], predictions))


def train_and_test(df, k=0, random_state=None):
    """Evaluate a linear regression predicting 'SalePrice'.

    Every integer/float column of ``df`` except 'SalePrice' is a predictor.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'SalePrice'.
    k : int, default 0
        * 0 - hold-out: fit on the first 1460 rows, score on the rest.
        * 1 - shuffle the rows, split them at row 1460, fit on each part and
          score on the other; both RMSE values are printed.
        * otherwise - shuffled K-fold cross-validation with ``k`` splits; the
          per-fold RMSE values are printed.
    random_state : int or None, default None
        Seed for the shuffles (``k == 1`` and K-fold). ``None`` keeps the
        unseeded behaviour.

    Returns
    -------
    float
        The RMSE (``k == 0``) or the mean RMSE over the folds.
    """
    numeric_df = df.select_dtypes(include=['integer', 'float'])
    features = numeric_df.columns.drop("SalePrice")
    lr = LinearRegression()

    if k == 0:
        return _fit_and_score(lr, df[:1460], df[1460:], features)

    if k == 1:
        # Split the *shuffled* frame; splitting `df` itself would discard the
        # shuffle and reuse the fixed hold-out split.
        shuffled_df = df.sample(frac=1, random_state=random_state)
        fold_one = shuffled_df.iloc[:1460]
        fold_two = shuffled_df.iloc[1460:]

        rmse_one = _fit_and_score(lr, fold_one, fold_two, features)
        rmse_two = _fit_and_score(lr, fold_two, fold_one, features)

        print(rmse_one)
        print(rmse_two)
        return np.mean([rmse_one, rmse_two])

    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)
    rmse_values = [
        _fit_and_score(lr, df.iloc[train_index], df.iloc[test_index], features)
        for train_index, test_index in kf.split(df)
    ]
    print(rmse_values)
    return np.mean(rmse_values)

In [26]:
# Reload the raw file and run the full pipeline with 4-fold cross-validation.
# The path is a raw string so its backslashes are not parsed as (invalid)
# escape sequences; the resulting path is unchanged.
houses = pd.read_csv(r'OneDrive\Documents\my_datasets\AmesHousing.tsv', delimiter="\t")
transform_df = transform_features(houses)
filtered_df = select_features(transform_df)
rmse = train_and_test(filtered_df, k=4)

rmse

[28447.33683516985, 27257.477236179984, 34844.4410277774, 25373.059927206235]


28980.578756583367