In [40]:
#importing relevant libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn.model_selection import KFold

In [41]:
# Load the raw training and test tables from the working directory.
train, test = map(pd.read_csv, ('train.csv', 'test.csv'))

In [42]:
#Function to process features data set to create categorical dummy variables
def process_categories(df):
    """One-hot encode the categorical columns of a purchases frame.

    The input is not modified; a new DataFrame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Gender', 'Age', 'City_Category',
        'Stay_In_Current_City_Years', 'Occupation', 'Product_Category_1',
        'Product_Category_2' and 'Product_Category_3'.

    Returns
    -------
    pandas.DataFrame
        Column order: 'Occupation_*' dummies, the dummies of the four
        demographic columns (without 'Gender_M'), the remaining original
        columns, then the 'Product_category_1_*', 'Product_category_2_*' and
        'Product_category_3_*' dummies. All encoded source columns are removed.

    Raises
    ------
    KeyError
        If a required column is missing, or if no 'Gender_M' dummy was
        produced (i.e. the value 'M' never occurs in 'Gender').
    """
    categorical_variables = ['Gender','Age','City_Category','Stay_In_Current_City_Years']
    product_variables = ['Product_Category_1','Product_Category_2','Product_Category_3']

    # Pin the dummy dtype. pandas >= 2.0 defaults get_dummies to bool, while the
    # modelling cells pick features with select_dtypes(include=('int64','uint8')),
    # so bool dummies would silently vanish from the feature set. uint8 is what
    # older pandas produced by default, so this is backward compatible.
    dummy_dtype = 'uint8'

    demographic_dummies = pd.get_dummies(df[categorical_variables], dtype=dummy_dtype)
    occupation_dummies = pd.get_dummies(df['Occupation'], prefix='Occupation', dtype=dummy_dtype)

    df_towork = pd.concat([occupation_dummies, demographic_dummies, df], axis=1)
    df_towork = df_towork.drop(categorical_variables + ['Occupation'], axis=1)

    #Since male and female are the only 2 genders in the data we can drop one of the variables
    df_towork = df_towork.drop('Gender_M', axis=1)

    # Note the lower-case 'c' in the prefix: downstream helpers look the
    # columns up as 'Product_category_<n>_<value>'.
    product_dummies = [
        pd.get_dummies(df_towork[column], prefix=column.replace('Category', 'category'), dtype=dummy_dtype)
        for column in product_variables
    ]
    df_towork = pd.concat([df_towork] + product_dummies, axis=1).drop(product_variables, axis=1)
    return df_towork

In [43]:
def identify_missing_columns(df):
    """Add an all-zero column for every product-category dummy that is absent.

    The expected names are 'Product_category_1_<n>' for n = 1..20 and
    'Product_category_2_<n>.0' / 'Product_category_3_<n>.0' for n = 1..20
    (the latter two use the float spelling of n).

    The frame is modified in place: missing columns are appended in the
    order category 1, category 2, category 3. The same object is returned.
    """
    levels = range(1, 21)
    expected_names = (
        ['Product_category_1_' + str(level) for level in levels]
        + ['Product_category_2_' + str(float(level)) for level in levels]
        + ['Product_category_3_' + str(float(level)) for level in levels]
    )

    # Decide what is missing before touching the frame, then fill with zeros.
    absent = [name for name in expected_names if name not in df.columns]
    for name in absent:
        df[name] = 0
    return df

In [46]:
def collate_prod_categories(df):
    """Merge the three per-slot product-category dummies into one flag per category.

    For every n in 1..20 the columns 'Product_category_1_<n>',
    'Product_category_2_<n>.0' and 'Product_category_3_<n>.0' are combined
    with a bitwise OR into a new column 'Prod_category_<n>', and the three
    source columns are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain all 60 source columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the remaining original columns keep their order and the
        20 'Prod_category_<n>' columns follow in ascending n. The input frame
        is left untouched.

    Raises
    ------
    KeyError
        If any of the 60 source columns is missing.
    """
    combined = {}
    cols_to_drop = []
    for i in range(1,21):
        category1_name = 'Product_category_1_' + str(i)
        category2_name = 'Product_category_2_' + str(float(i))
        category3_name = 'Product_category_3_' + str(float(i))
        combined['Prod_category_' + str(i)] = (
            df[category1_name] | df[category2_name] | df[category3_name]
        )
        cols_to_drop.extend([category1_name, category2_name, category3_name])

    # Drop once, on a copy: assigning into `df` directly would leak new
    # columns into the caller's frame, and dropping inside the loop would copy
    # the whole frame 20 times.
    result = df.drop(cols_to_drop, axis=1)
    for new_column_name, values in combined.items():
        result[new_column_name] = values
    return result

In [47]:
# Encode the training data and pad any absent product-category dummies,
# then merge the per-slot dummies into one flag per product category.
df_train = identify_missing_columns(process_categories(train))
df_train_final = collate_prod_categories(df_train)

In [48]:
# Summarise the engineered training frame: column names, non-null counts and dtypes.
df_train_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 61 columns):
 #   Column                         Non-Null Count   Dtype 
---  ------                         --------------   ----- 
 0   Occupation_0                   550068 non-null  uint8 
 1   Occupation_1                   550068 non-null  uint8 
 2   Occupation_2                   550068 non-null  uint8 
 3   Occupation_3                   550068 non-null  uint8 
 4   Occupation_4                   550068 non-null  uint8 
 5   Occupation_5                   550068 non-null  uint8 
 6   Occupation_6                   550068 non-null  uint8 
 7   Occupation_7                   550068 non-null  uint8 
 8   Occupation_8                   550068 non-null  uint8 
 9   Occupation_9                   550068 non-null  uint8 
 10  Occupation_10                  550068 non-null  uint8 
 11  Occupation_11                  550068 non-null  uint8 
 12  Occupation_12                  550068 non-nu

In [49]:
#Combining steps above to encapsulate it in a single function
def train_and_test(df,k=0):
    """Fit LinearRegression models on 'Purchase' and return an RMSE.

    Features are every int64/uint8 column of ``df`` other than 'Purchase'.

    * ``k == 0``: fit on the first 440055 rows, evaluate on the remaining rows.
    * ``k == 1``: shuffle the rows, cut at the same position, fit and evaluate
      once in each direction and return the mean of the two RMSE values.
    * any other ``k``: shuffled ``KFold`` with ``k`` splits; returns the mean
      RMSE over the folds.

    No random seed is set, so the shuffled modes vary from run to run.
    """
    target = 'Purchase'
    features = df.select_dtypes(include=('int64','uint8')).drop([target],axis=1).columns
    boundary = 440055  # row position separating the two partitions

    def score_split(fit_rows, eval_rows):
        # Fit a fresh model on one partition and return its RMSE on the other.
        regressor = linear_model.LinearRegression()
        regressor.fit(fit_rows[features], fit_rows[target])
        predicted = regressor.predict(eval_rows[features])
        return np.sqrt(mean_squared_error(predicted, eval_rows[target]))

    if k == 0:
        return score_split(df[:boundary], df[boundary:])

    if k == 1:
        shuffled = df.sample(frac=1,)
        head, tail = shuffled[:boundary], shuffled[boundary:]
        return np.mean([score_split(head, tail), score_split(tail, head)])

    splitter = KFold(n_splits=k, shuffle=True)
    fold_scores = [
        score_split(df.iloc[fit_idx], df.iloc[eval_idx])
        for fit_idx, eval_idx in splitter.split(df)
    ]
    return np.mean(fold_scores)

In [50]:
# Mean RMSE over 10 shuffled folds on the engineered training frame.
train_and_test(df_train_final, k=10)

3621.602521704796

In [51]:
# Apply the same encoding steps to the test data as were applied to the training data.
df_test = identify_missing_columns(process_categories(test))
df_test_final = collate_prod_categories(df_test)

In [52]:
# Fit a linear model on the full training frame and write predictions for the test frame.
numeric_train = df_train_final.select_dtypes(include=('int64','uint8'))
features = numeric_train.drop(['Purchase'],axis=1).columns

lr = linear_model.LinearRegression()
lr.fit(df_train_final[features], df_train_final['Purchase'])

# Indexing the test frame by `features` puts its columns in the training order.
predict = lr.predict(df_test_final[features])

submissions = pd.DataFrame({
    'Purchase': predict,
    'User_ID': df_test_final['User_ID'],
    'Product_ID': df_test_final['Product_ID'],
})
submissions.to_csv('submission_0328_b.csv',index=False)

In [53]:
#Combining steps above to encapsulate it in a single function
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
def train_and_test(df,k=0):
    """Fit 1-nearest-neighbour regressors on 'Purchase' and return an RMSE.

    This definition replaces any earlier ``train_and_test`` in the notebook.
    Features are every int64/uint8 column of ``df`` other than 'Purchase'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns and a 'Purchase' column.
    k : int, default 0
        * ``0``: fit on the first 440055 rows, evaluate on the remaining rows.
        * ``1``: shuffle the rows, cut at the same position, fit and evaluate
          once in each direction and return the mean of the two RMSE values.
        * otherwise: shuffled ``KFold`` with ``k`` splits; returns the mean
          RMSE over the folds.

    Returns
    -------
    float
        The RMSE (or mean RMSE) for the chosen mode.

    Notes
    -----
    No random seed is set, so the shuffled modes vary from run to run.
    """
    features=df.select_dtypes(include=('int64','uint8')).drop(['Purchase'],axis=1).columns
    split_at = 440055  # row position separating the two partitions

    def _split_rmse(train_rows, test_rows):
        # One estimator for every mode. The k == 0 branch previously still
        # fitted LinearRegression, so it did not measure the model this
        # function evaluates everywhere else.
        # Only the non-default settings are spelled out; the remaining
        # arguments of the original call were the library defaults.
        model = KNeighborsRegressor(n_neighbors=1, algorithm='kd_tree')
        model.fit(train_rows[features],train_rows['Purchase'])
        predict=model.predict(test_rows[features])
        return np.sqrt(mean_squared_error(test_rows['Purchase'],predict))

    if k==0:
        return _split_rmse(df[:split_at],df[split_at:])

    if k==1:
        df_shuffle = df.sample(frac=1)
        fold_one=df_shuffle[:split_at]
        fold_two=df_shuffle[split_at:]
        rmse_one=_split_rmse(fold_one,fold_two)
        rmse_two=_split_rmse(fold_two,fold_one)
        return np.mean([rmse_one,rmse_two])

    kf = KFold(n_splits=k, shuffle=True)
    rmse_vals=[
        _split_rmse(df.iloc[train_index],df.iloc[test_index])
        for train_index,test_index in kf.split(df)
    ]
    return np.mean(rmse_vals)

In [54]:
# Mean RMSE over 10 shuffled folds, using the train_and_test redefined in the preceding cell.
train_and_test(df_train_final, k=10)

4428.832198169276

In [55]:
def train_and_test_rf(df,k=0):
    """Fit random-forest regressors on 'Purchase' and return an RMSE.

    Features are every int64/uint8 column of ``df`` other than 'Purchase'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns and a 'Purchase' column.
    k : int, default 0
        * ``0``: fit on the first 440055 rows, evaluate on the remaining rows.
        * ``1``: shuffle the rows, cut at the same position, fit and evaluate
          once in each direction and return the mean of the two RMSE values.
        * otherwise: shuffled ``KFold`` with ``k`` splits; returns the mean
          RMSE over the folds.

    Returns
    -------
    float
        The RMSE (or mean RMSE) for the chosen mode.

    Notes
    -----
    No random seed is set (neither for the shuffles nor for the forest), so
    results vary from run to run.
    """
    features=df.select_dtypes(include=('int64','uint8')).drop(['Purchase'],axis=1).columns
    split_at = 440055  # row position separating the two partitions

    def _split_rmse(train_rows, test_rows):
        # Every split is scored with a random forest. The second half of the
        # k == 1 branch previously built a KNeighborsRegressor by mistake, so
        # the averaged score mixed two different models.
        # Only n_estimators is passed: the other arguments of the original
        # call were library defaults, and three of them (criterion='mse',
        # max_features='auto', min_impurity_split) are rejected by newer
        # scikit-learn releases.
        model = RandomForestRegressor(n_estimators=100)
        model.fit(train_rows[features],train_rows['Purchase'])
        predict=model.predict(test_rows[features])
        return np.sqrt(mean_squared_error(test_rows['Purchase'],predict))

    if k==0:
        return _split_rmse(df[:split_at],df[split_at:])

    if k==1:
        df_shuffle = df.sample(frac=1)
        fold_one=df_shuffle[:split_at]
        fold_two=df_shuffle[split_at:]
        rmse_one=_split_rmse(fold_one,fold_two)
        rmse_two=_split_rmse(fold_two,fold_one)
        return np.mean([rmse_one,rmse_two])

    kf = KFold(n_splits=k, shuffle=True)
    rmse_vals=[
        _split_rmse(df.iloc[train_index],df.iloc[test_index])
        for train_index,test_index in kf.split(df)
    ]
    return np.mean(rmse_vals)

In [59]:
#train_and_test_rf(df_train_final,10)
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score

In [None]:
# Hold-out and 10-fold evaluation of a random forest on the engineered training frame.
features=df_train_final.select_dtypes(include=('int64','uint8')).drop(['Purchase'],axis=1).columns
all_X=df_train_final[features]
all_y=df_train_final['Purchase']
train_X,test_X,train_y,test_y = train_test_split(all_X,all_y,test_size=.2,random_state=0)

# Only n_estimators is passed: the other arguments previously listed here were
# library defaults, and three of them (criterion='mse', max_features='auto',
# min_impurity_split) are rejected by newer scikit-learn releases.
model = RandomForestRegressor(n_estimators=100)

# all_X already holds exactly the `features` columns, so the splits are used as-is.
model.fit(train_X,train_y)
predictions=model.predict(test_X)

# NOTE: for a regressor, `score` and the default cross_val_score scoring report
# R^2 (see scikit-learn docs), not classification accuracy; the variable names
# are kept because a later cell prints them.
single_accuracy=model.score(test_X,test_y)
scores=cross_val_score(model,all_X,all_y,cv=10)
cross_validated_accuracy = np.mean(scores)

In [None]:
# Hold-out score first, then the mean cross-validated score, one per line.
print(single_accuracy, cross_validated_accuracy, sep='\n')