# Changes

I'm going to put all of the changes I want to make to both the train and test function here. It's a single function that takes the name of a csv as a parameter. Then I can just apply the function to the  CSVs and read out the finished pickles, confident that the same changes were made to both. This makes more sense than making sure that my notebooks work with both sets (by for instance always checking if the 'SalePrice' column exists before I reference it.) It also has the bonus of making much of mode code more organized and easier to read, without all my distracting commentary, charts &c.

In [6]:
import pandas as pd
import numpy as np 
import os

In [35]:
def changes(csv):
    ames = pd.read_csv(f'../datasets/{csv}')
    # null values to 'None'
    ames.fillna('None', inplace = True)
    # giving ordinal features numeric values
    to_numeric = {'Ex': 5, 'Gd': 4, 'TA' : 3, 'Fa' : 2, 'Po' : 1, 'None' : 0}
    ames[['Bsmt Qual', 'Bsmt Cond']] = ames[['Bsmt Qual', 'Bsmt Cond']].replace(to_numeric)
    to_numeric = {'Glq': 3, 'Unf' : 2, 'AlQ' : 2, 'BLQ':1, 'LwQ': 1, 'Rec' : 1, 'None' : 0}
    ames['BsmtFin Type 1'] = ames['BsmtFin Type 1'].replace(to_numeric)
    to_numeric = {'Ex': 5, 'Gd': 4, 'TA' : 3, 'Fa' : 2, 'Po' : 1, 'None' : 0}
    ames[['Bsmt Qual', 'Bsmt Cond']] = ames[['Bsmt Qual', 'Bsmt Cond']].replace(to_numeric)
    to_numeric={'Typ': 7,'Min1' :6, 'Min2':5, 'Mod':4, 'Maj1':3,'Maj2':2, 'Sev':1 , 'Sal':0	}
    ames['Functional'].replace(to_numeric, inplace = True)
    #dropping extraneous data
    ames.drop('BsmtFin Type 2', axis = 1, inplace = True)
    ames.drop('Bsmt Half Bath',axis = 1, inplace = True)
    # changing dtypes
    ames['MS SubClass'].apply(lambda x: str(x))
    #save as pickle
    ames.to_pickle(f'../datasets/pickles/pickled_{csv}.pkl')
    # This will clean code and prepare it for the "3_Feature_Engineering" notebook. 
    
    
    # Feature Engineering Transformations
    
    # variables for feature engineering, see EDA notebook for further explanation
    p25 =  129825.0
    p75 = 214000.0
    mean = 181469.70160897123
    res = ['RL', 'RM', 'FV', 'RH']
    neighs_1 = [ 'Blueste','Edwards','Landmrk', 'Mitchel', 'NAmes', 'NPkVill', 'SWISU', 'Sawyer']
    neighs_2 = ['Blmngtn', 'CollgCr', 'Crawfor', 'Gilbert','Greens', 'NWAmes', 'SawyerW']
    neighs_3 = ['ClearCr','GrnHill', 'NoRidge', 'NridgHt', 'Somerst', 'StoneBr', 'Timber', 'Veenker']
    ex0 = ['AsbShng','AsphShn', 'Cblock']
    ex1 = ['BrkComm', 'HdBoard', 'MetalSd','Plywood','Stucco','Wd Sdng','WdShing']
    ex2 =['BrkFace']
    ex3 = ['CemntBd','ImStucc','Stone','VinylSd']
    ex20 = ['AsbShng', 'CBlock']
    ex21 = ['AsphShn', 'Brk Cmn', 'HdBoard', 'MetalSd', 'Plywood', 'Stone', 'Stucco', 'Wd Sdng', 'Wd Shng']
    ex22 =  ['BrkFace']
    ex23 = ['CmentBd', 'ImStucc', 'VinylSd']
    garage_feats = ['Garage Type', 'Garage Yr Blt',
       'Garage Finish', 'Garage Cars', 'Garage Area', 'Garage Qual',
       'Garage Cond', 'SalePrice']
    
    
    # add NeighborhoodScore
    ames['NeighborhoodScore'] = 0
    ames.loc[ames['Neighborhood'].isin(neighs_1), 'NeighborhoodScore'] = 1
    ames.loc[ames['Neighborhood'].isin(neighs_2), 'NeighborhoodScore'] = 2
    ames.loc[ames['Neighborhood'].isin(neighs_3), 'NeighborhoodScore'] = 3
    
    
    #Convert Zoning to residential or not
    ames['isResidential'] = (ames['MS Zoning'].isin(res))*1
    ames.drop('MS Zoning',1, inplace = True)
    
    # Simplify Condition 1
    ames['Condition 1'].replace('PosA', 'PosN', inplace = True)
    ames['Condition 1'].replace('RRNn', 'RR', inplace = True)
    ames['Condition 1'].replace('RRAn', 'RR', inplace = True)
    ames['Condition 1'].replace('RRNe', 'RR', inplace = True)
    ames['Condition 1'].replace('RRAe', 'RR', inplace = True)
    
    #Roof Aggregation
    
    ames['roof'] = ames['Roof Style'] + ames['Roof Matl']
    ames.drop('Roof Style', 1, inplace = True)
    ames.drop('Roof Matl', 1, inplace = True)
    
    #OneHots 
    
    ames['Central Air'].replace('Y', 1, inplace = True)
    ames['Central Air'].replace('N', 0, inplace = True)
    ames = pd.get_dummies(data = ames, columns = ['Sale Type'], drop_first = True, prefix = 'Sale Type_')
    
    ames['Street'].replace('Pave', 1, inplace = True)
    ames['Street'].replace('Grvl', 0, inplace = True)
    ames['Kitchen Qual']
    
    
    #Dropping some columns
    
    ames.drop('Utilities', 1, inplace = True)
    ames.drop('Condition 2', 1, inplace = True)
    ames.drop('Land Slope', 1, inplace = True)
    ames.drop('Neighborhood', 1, inplace = True)
    ames.drop('Alley', 1, inplace = True)
    ames.drop('PID', 1, inplace = True)
    ames.drop('Id', 1, inplace = True)
    to_numeric= {'Fa':0, 'TA':1, 'Gd':2, 'Ex':3}
    ames['Exter Qual'] = ames['Exter Qual'].replace(to_numeric)
    
    # Exteriors
    
    
    to_numeric= {'Po': 0, 'Fa':1, 'Gd':2, 'TA':3, 'Ex':4}
    ames['Exter Cond'] = ames['Exter Cond'].replace(to_numeric)
    ames['Ex1Score'] = 0
    ames.loc[ames['Exterior 1st'].isin(ex1), 'Exterior 1st'] = 1
    ames.loc[ames['Exterior 1st'].isin(ex2), 'Exterior 1st'] = 2
    ames.loc[ames['Exterior 1st'].isin(ex3), 'Exterior 1st'] = 3
    ames.drop('Exterior 1st',1, inplace = True)
    
    ames['Ex2Score'] = 0
    ames.loc[ames['Exterior 2nd'].isin(ex21), 'Exterior 2nd'] = 1
    ames.loc[ames['Exterior 2nd'].isin(ex22), 'Exterior 2nd'] = 2
    ames.loc[ames['Exterior 2nd'].isin(ex23), 'Exterior 2nd'] = 3
    
    ames['Exter Qual * Cond']= ames['Exter Qual'] * ames['Exter Cond']
    ames.drop(['Exter Qual', 'Exter Cond'], 1, inplace = True)
    
    # Garage
    
    to_numeric = {'None':0, 'Po':1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex':5}
    ames['Garage Qual'] = ames['Garage Qual'].replace(to_numeric)
    ames['Garage Qual'] = ames['Garage Qual'].replace(to_numeric)

    
    # Kitchen
    to_numeric = {'Fa':0, 'TA': 1, 'Gd' : 2, 'Ex':3}
    ames['Kitchen Qual'] = ames['Kitchen Qual'].replace(to_numeric)
    
    
    #to_numeric = {'None':0, 'TA': 1, 'Fa': 2, 'Gd' : 3, 'Ex':4}
    ames['Pool QC'] = ames['Pool QC'].replace(to_numeric)

    
    

    
    for column in ames.columns:
    
        if str(ames[column].dtype) == 'object': 
            try:
                ames[column] = ames[column].replace('None', 0)
                ames[column] = ames[column].apply(lambda x : float(x))
            except:
                pass
    
    
    
    #save as pickle
    ames.to_pickle(f'../datasets/pickles/pickled_{csv}.pkl')
   
    

In [36]:
TrainTest = ['test.csv', 'train.csv']
for csv in TrainTest:
    changes(csv)

In [39]:
proof = pd.read_pickle('../datasets/pickles/pickled_train.csv.pkl')

In [38]:
proof.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 79 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   MS SubClass        2051 non-null   int64  
 1   Lot Frontage       2051 non-null   float64
 2   Lot Area           2051 non-null   int64  
 3   Street             2051 non-null   int64  
 4   Lot Shape          2051 non-null   object 
 5   Land Contour       2051 non-null   object 
 6   Lot Config         2051 non-null   object 
 7   Condition 1        2051 non-null   object 
 8   Bldg Type          2051 non-null   object 
 9   House Style        2051 non-null   object 
 10  Overall Qual       2051 non-null   int64  
 11  Overall Cond       2051 non-null   int64  
 12  Year Built         2051 non-null   int64  
 13  Year Remod/Add     2051 non-null   int64  
 14  Exterior 2nd       2051 non-null   object 
 15  Mas Vnr Type       2051 non-null   object 
 16  Mas Vnr Area       2051 