# Models

  - RandomForest
  - KNN
  - SVM
  - GaussianNB
  - Logistic Regression
  

# Import packages

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn import compose
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_validate
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

### Get data

In [None]:
# Load the six raw CSV tables from the local data/ directory.
aisles = pd.read_csv('data/aisles.csv')
departments = pd.read_csv('data/departments.csv')
order_prior = pd.read_csv('data/order_products__prior.csv')
order_train = pd.read_csv('data/order_products__train.csv')
orders = pd.read_csv('data/orders.csv')
# A stray trailing character after this call made the whole cell a syntax error.
products = pd.read_csv('data/products.csv')

In [None]:
# Attach the aisle columns to every product (left join keeps all products).
products = products.merge(aisles, on='aisle_id', how='left')

In [None]:
# Attach the department columns to every product (left join keeps all products).
products = products.merge(departments, on='department_id', how='left')

In [None]:
# Stack the prior and train order-product rows into one long table.
df = pd.concat((order_prior, order_train), axis='index')

In [None]:
# Bring the product, aisle and department columns onto each order line.
df = df.merge(products, on='product_id', how='left')

In [None]:
# Bring the order-level columns onto each order line.
df = df.merge(orders, on='order_id', how='left')

In [None]:
# Persist the merged table (without the index column) so it can be reloaded directly.
df.to_csv('instacart.csv', index = False)

In [4]:
# Reload the merged table from instacart.csv.
df = pd.read_csv('instacart.csv')

In [3]:
# Drop every row that has a missing value in any column.
# NOTE(review): this cell's execution count (3) precedes the reload cell's (4),
# so the saved outputs below may come from a frame without this step applied —
# confirm with Restart & Run All.
df = df.dropna() 

In [5]:
# Column dtypes and memory footprint of the merged table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33819106 entries, 0 to 33819105
Data columns (total 15 columns):
order_id                  int64
product_id                int64
add_to_cart_order         int64
reordered                 int64
product_name              object
aisle_id                  int64
department_id             int64
aisle                     object
department                object
user_id                   int64
eval_set                  object
order_number              int64
order_dow                 int64
order_hour_of_day         int64
days_since_prior_order    float64
dtypes: float64(1), int64(10), object(4)
memory usage: 3.8+ GB


In [6]:
# (rows, columns) of the merged table.
df.shape

(33819106, 15)

### Feature engineering 

In [7]:
def data_clean(df):
    """
    Engineer model features from the merged order/product table.

    Steps, in order:
      1. One-hot encode ``department_id`` (first level dropped).
      2. Add ``weekend``: 1 when ``order_dow`` is 5 or 6, else 0.
      3. Add ``department_is_staple``: 1 for the staple departments listed
         in the body, else 0.
      4. Add ``product_is_organic``: 1 when ``product_name`` contains
         "organic" (case-insensitive), else 0.
      5. Drop the text columns ``product_name``, ``aisle`` and ``department``.

    All flags are computed with vectorised pandas operations rather than
    row-wise ``apply``. A missing ``product_name`` or a department that is not
    in the staple list yields 0 instead of raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``department_id``, ``order_dow``, ``department``,
        ``product_name`` and ``aisle``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the engineered columns; the input is not modified.
    """

    print("Getting dummies...")
    df = pd.get_dummies(df, columns=['department_id'], drop_first=True)

    print("Creating weekend flag...")
    # NOTE(review): assumes order_dow values 5 and 6 are the weekend days —
    # confirm against the dataset's day-of-week encoding.
    df['weekend'] = df['order_dow'].isin([5, 6]).astype('int64')

    # Department staples
    print('Encoding dept staples...')
    # Only these departments count as staples; every other department
    # (including ones not seen before, or a missing value) is encoded as 0.
    staple_departments = [
        'dairy eggs',
        'produce',
        'meat seafood',
        'dry goods pasta',
    ]
    df['department_is_staple'] = (
        df['department'].isin(staple_departments).astype('int64')
    )

    print('Encoding organic products...')
    # na=False maps a missing product name to "not organic" instead of failing.
    df['product_is_organic'] = (
        df['product_name']
        .str.lower()
        .str.contains('organic', regex=False, na=False)
        .astype('int64')
    )

    print('Dropping columns...')
    return df.drop(columns=['product_name', 'aisle', 'department'])

### Train-test split :     Stratified shuffle split

Because the dataset is very large (~34M rows), we test our randomized search code on a small subset of the data.
We draw that subset with a stratified shuffle split, which keeps the class balance of `reordered` in both the train and test parts.

In [8]:
from sklearn.model_selection import StratifiedShuffleSplit

# Full feature matrix: everything except the target and the eval_set marker,
# passed through the feature-engineering step.
X = data_clean(df.drop(columns=['reordered', 'eval_set']))
# Full target vector.
y = df.loc[:, 'reordered']
# Five stratified shuffles; each holds out 99% of the rows as the test part.
sss = StratifiedShuffleSplit(test_size=0.99, n_splits=5, random_state=42)
sss.get_n_splits(X, y)

Getting dummies...
Creating weekend flag...
Encoding dept staples...
Encoding organic products...
Dropping columns...


5

In [9]:
# Show the training row positions of each of the splits.
# After the loop, train_index / test_index stay bound to the LAST split.
for train_index, test_index in sss.split(X, y):
    print(train_index)

[ 8595659 33815089 26713025 ... 24987894 20821339   807458]
[16737083 25838276 23945076 ...  4254900 10900318 27134643]
[22442783 14002905 12578098 ...  6344823 23684632  8900077]
[31941577  1968456 33654354 ... 10040704   357393 17371561]
[28203419 11990394 21677894 ...  3675868 30929476 29800182]


In [10]:
# Materialise the train/test frames for the split currently held in
# train_index / test_index (positional row selection).
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

In [11]:
# Features and labels must have the same number of rows in each part.
assert len(X_train) == len(y_train)
assert len(X_test) == len(y_test)

In [13]:
# Report (rows, columns) for the train and test feature frames.
print(f'X_train: {X_train.shape}')
print(f'X_test: {X_test.shape}')

X_train: (338191, 32)
X_test: (33480915, 32)


# pipelines

In [17]:
def make_pipelines(numeric_features=None):
    """Create one default-hyperparameter pipeline per algorithm.

    Pipelines, in the order they are returned:
    1. Logistic Regression
    2. k-nearest neighbors (KNN)
    3. Naive Bayes (Gaussian)
    4. Support Vector Machines (SVM)

    Every pipeline starts with its own median-imputing preprocessor. The
    preprocessor is built fresh for each pipeline so that fitting one pipeline
    never changes the fitted state used by another. StandardScaler is applied
    before the scale-sensitive estimators (Logistic Regression, KNN, SVM);
    Gaussian Naive Bayes is left unscaled.

    Random Forest is not included here: because of the volume of our data it is
    run separately, together with its own randomized search.

    Parameters
    ----------
    numeric_features : list-like of column labels, optional
        Columns routed through the imputer. Defaults to the columns of the
        notebook-level ``X_train`` frame.

    Returns
    -------
    list of sklearn.pipeline.Pipeline
        ``[LR, KNN, NB, SVM]``, all unfitted.
    """
    if numeric_features is None:
        # Fall back to the notebook-level training frame.
        numeric_features = X_train.columns

    def _new_preprocessor():
        """Return a new, unfitted median-imputing ColumnTransformer."""
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median'))])
        return compose.ColumnTransformer(transformers=[
            ('num', numeric_transformer, numeric_features)])

    # Logistic regression is regularised and solved iteratively, so features on
    # very different scales hurt both convergence and the penalty; scale first.
    LR = Pipeline([
        ('preprocessor', _new_preprocessor()),
        ('scaler', StandardScaler()),
        ('LogisticRegression', LogisticRegression(random_state=42))])
    KNN = Pipeline([
        ('preprocessor', _new_preprocessor()),
        ('scaler', StandardScaler()),
        ('KNeighborsClassifier', KNeighborsClassifier())])
    NB = Pipeline([
        ('preprocessor', _new_preprocessor()),
        ('GaussianNB', GaussianNB())])
    SVM = Pipeline([
        ('preprocessor', _new_preprocessor()),
        ('scaler', StandardScaler()),
        ('SVC', SVC(random_state=42))])

    pipelines = [LR, KNN, NB, SVM]
    return pipelines

In [18]:
# Build the list of unfitted model pipelines.
pipelines = make_pipelines()

In [None]:
# Fit each pipeline in turn on the same training split.
# After the loop, `pipe` stays bound to the last pipeline in the list.
for pipe in pipelines:
    pipe.fit(X_train, y_train)



In [None]:
# Predict with the last pipeline in the list (the one the fit loop leaves in
# `pipe`). The held-out features live in `X_test`; lowercase `x_test` is not
# assigned in any active cell and raised a NameError here.
preds = pipelines[-1].predict(X_test)

### RandomizedSearch on RandomForest

In [14]:
from sklearn.model_selection import RandomizedSearchCV

In [15]:
def make_random_cv(estimator, param_distr, cv=3, n_iter=5,
                   scoring=None, random_state=42, n_jobs=None):
    """Build an unfitted RandomizedSearchCV around ``estimator``.

    Parameters
    ----------
    estimator : scikit-learn estimator
        Model whose hyperparameters are searched.
    param_distr : dict
        Passed to RandomizedSearchCV as ``param_distributions``.
    cv : int, default 3
        Passed to RandomizedSearchCV as ``cv``.
    n_iter : int, default 5
        Number of parameter settings sampled.
    scoring : str or callable, optional
        Metric used to rank candidates. ``None`` keeps the estimator's own
        default scorer, exactly as before this parameter existed.
    random_state : int or None, default 42
        Seed for sampling the parameter settings, so repeated runs draw the
        same candidates. Pass ``None`` for unseeded sampling.
    n_jobs : int, optional
        Passed to RandomizedSearchCV as ``n_jobs``.

    Returns
    -------
    sklearn.model_selection.RandomizedSearchCV
        The configured, unfitted search object.
    """
    return RandomizedSearchCV(
        estimator=estimator,
        param_distributions=param_distr,
        n_iter=n_iter,
        cv=cv,
        scoring=scoring,
        n_jobs=n_jobs,
        random_state=random_state,
    )

In [16]:
# Base forest whose hyperparameters the randomized search will tune.
rf_est = RandomForestClassifier(random_state=42)
# Candidate values the search samples from.
param_distr = dict(
    n_estimators=[25, 50, 100, 200],
    max_features=list(range(1, 8)),
    min_samples_leaf=[200, 500, 1500, 5000],
)
rf_random_cv = make_random_cv(rf_est, param_distr)

# random forest

**Fit and predict using the best estimator returned by the randomized search**

In [262]:
# Run the randomized search once per stratified split and report the test F1.
for train_index, test_index in sss.split(X, y):

    # Positional row selection for this split.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Features and labels must line up row for row.
    assert len(X_train) == len(y_train)
    assert len(X_test) == len(y_test)

    # Search hyperparameters on this split's training part.
    print("Fitting...")
    rf_random_cv.fit(X_train, y_train)
    print(rf_random_cv.best_params_)

    # Score the refitted best model on this split's test part.
    print("Predicting...")
    y_pred = rf_random_cv.best_estimator_.predict(X_test)
    print(f1_score(y_test, y_pred, average='binary'))

Fitting...
{'n_estimators': 100, 'min_samples_leaf': 500, 'max_features': 6}
Predicting...
0.788834656257673
Fitting...
{'n_estimators': 25, 'min_samples_leaf': 200, 'max_features': 7}
Predicting...
0.7928891213005266
Fitting...
{'n_estimators': 200, 'min_samples_leaf': 1500, 'max_features': 6}
Predicting...
0.7852710803433028
Fitting...
{'n_estimators': 50, 'min_samples_leaf': 200, 'max_features': 3}
Predicting...
0.7924978687127026
Fitting...
{'n_estimators': 25, 'min_samples_leaf': 500, 'max_features': 7}
Predicting...
0.7873987266862479


In [246]:
# X_train = X.iloc[train_index, :]
# y_train = y.iloc[train_index]
# X_test = X.iloc[test_index, :]
# y_test = y.iloc[test_index]

In [247]:
# assert X_train.shape[0] == y_train.shape[0]
# assert X_test.shape[0] == y_test.shape[0]

In [248]:
# Report (rows, columns) for the train and test feature frames currently in scope.
print(f'X_train: {X_train.shape}')
print(f'X_test: {X_test.shape}')

X_train: (317410, 13)
X_test: (31423628, 13)


In [253]:
# Refit the randomized search on the current training split. The former
# single-iteration `for i in range(1)` wrapper did nothing and is removed.
rf_random_cv.fit(X_train, y_train)
print(rf_random_cv.best_params_)
# F1 measured on the training rows themselves, not on held-out data.
y_pred = rf_random_cv.best_estimator_.predict(X_train)
print(f1_score(y_train, y_pred, average='binary'))

{'n_estimators': 25, 'min_samples_leaf': 200, 'max_features': 4}
0.7920215469753042


In [257]:
# F1 of the best estimator on the held-out test rows.
# The saved output shows this run was interrupted (KeyboardInterrupt).
y_pred = rf_random_cv.best_estimator_.predict(X_test)
print(f1_score(y_test, y_pred, average='binary'))

KeyboardInterrupt: 

In [None]:
# train_subset_size = 0.001
# test_subset_size = 0.001

# train_df = train_df.sample(frac=train_subset_size, replace=False, random_state=42)
# test_df = test_df.sample(frac=test_subset_size, replace=False, random_state=42)

# x_train = data_clean(train_df.drop(['reordered', 'eval_set'], axis=1))
# x_test = data_clean(test_df.drop(['reordered', 'eval_set'], axis=1))

# y_train = train_df['reordered']
# y_test = test_df['reordered']