<a href="https://colab.research.google.com/github/vincentcommere/IFT6758-Kaggle-Data-Challenge/blob/main/final_de_DS_challenge_code_submission.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# README :

- Open in Colab *(HTML titles won't appear in GitHub's Markdown reader)*
- Update your project path drive in the **Environment Set-up** section
- Run all cells of the **Useful functions** section
- Run all cells of **Data cleaning & Feature engineering Classes** section
- Run the **Data Preparation 1** cell (it builds `X_train`, `y_train` and `X_test`)
- Run all cells of **Model - 1** section
- You will get the **curr_submission.csv** submission file

# **Environment Set-up**


In [None]:
# Mount Google Drive at /content/drive (Colab only) so the project files can be copied from it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


<font color='red'>**Feel free to change the following path based on your own Drive**</font>

In [None]:
cp -R /content/drive/MyDrive/02_ETUDES/01-MILA-UDEM/IFT6758/Kaggle/* .

In [None]:
!unzip -qq ift6758-a20.zip

replace sample_submission.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: A


### _Librairies Installations_

In [None]:
!pip install catboost
!pip install lightgbm
!pip install scikit-image
!pip install opencv-python



### _Globales Variables_

In [None]:
# Paths (relative to the working directory) of the files read by the data-preparation cells.
train_csv_path = 'train.csv'
test_csv_path = 'test.csv'
# Despite the variable name, this is a NumPy .npy file (loaded with np.load), not a CSV.
cluster_csv_path = 'cluster_features.npy'

## *Libraries Imports*


In [None]:
import csv

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import PowerTransformer
from sklearn import linear_model
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor, GradientBoostingRegressor, 
    StackingRegressor, BaggingRegressor,
)
from xgboost import XGBRegressor
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.metrics import mean_squared_log_error


# <font color=''>**Useful functions**</font>

### *Evaluation metric function*

In [None]:
def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error between targets and predictions.

    Args:
        y_true: ground-truth values.
        y_pred: predicted values.

    Returns:
        The square root of ``mean_squared_log_error(y_true, y_pred)``.
    """
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)

### *Compute cross validation function*

In [None]:
def get_cv_predictions(model, X_train, y_train, n_splits=5, shuffle=True):
    """Return out-of-fold predictions for ``model`` using K-fold cross-validation.

    Args:
        model: estimator handed to ``cross_val_predict``.
        X_train: training features.
        y_train: training target.
        n_splits: number of folds.
        shuffle: whether rows are shuffled before splitting; shuffling uses a
            fixed seed (42) so the folds are reproducible.

    Returns:
        The predictions returned by ``cross_val_predict`` for the training rows.
    """
    # Honour the caller's arguments (they were previously ignored). A seed is
    # only meaningful when shuffling, so it is omitted otherwise.
    cv_fold = KFold(
        n_splits=n_splits,
        shuffle=shuffle,
        random_state=42 if shuffle else None,
    )
    return cross_val_predict(model, X_train, y_train, cv=cv_fold)

# <font color=''>**Data cleaning & Feature engineering Classes**</font>

### _Imputer Class_

In [None]:
class Imputer:
    """Fill missing values in the profile table (column names already snake_cased)."""

    @staticmethod
    def _fillna(df, col_name, value, na_symbol=None):
        """Replace missing entries of ``df[col_name]`` with ``value``, modifying ``df``.

        Args:
            df: DataFrame to modify.
            col_name: name of the column to impute.
            value: scalar, or a Series aligned on ``df.index``, used as the fill value.
            na_symbol: optional placeholder (e.g. ``'??'``) that also counts as missing.
        """
        column = df[col_name]
        if na_symbol is not None:
            column = column.replace(na_symbol, np.nan)
        # Assign the result back to the frame: chained ``df[col].fillna(inplace=True)``
        # operates on a temporary under pandas copy-on-write and may leave df untouched.
        df[col_name] = column.fillna(value)

    def impute_data(self, df, train):
        """Impute every column the feature engineering step relies on.

        Args:
            df: DataFrame with snake_cased column names; it is modified in place.
            train: when True, rows without a ``num_of_profile_likes`` target are dropped.

        Returns:
            The same DataFrame object, after imputation.
        """
        if train:
            df.dropna(subset=['num_of_profile_likes'], axis=0, inplace=True)
        self._fillna(df, col_name='profile_cover_image_status', value='Not set')
        self._fillna(df, col_name='is_profile_view_size_customized?', value=False)
        self._fillna(df, col_name='location_public_visibility', value='disabled', na_symbol='??')
        self._fillna(df, col_name='profile_category', value='unknown', na_symbol=' ')
        # NOTE(review): medians are taken from whichever frame is passed in, so the
        # test set is imputed with its own medians rather than the training ones.
        self._fillna(df, col_name='avg_daily_profile_visit_duration_in_seconds', value=df['avg_daily_profile_visit_duration_in_seconds'].median())
        self._fillna(df, col_name='avg_daily_profile_clicks', value=df['avg_daily_profile_clicks'].median())

        # A row without utc_offset borrows the largest offset seen for the same
        # location. Grouping must be on location alone: including utc_offset in the
        # group keys excludes exactly the rows that need a value, making the fill a no-op.
        offset_by_location = df.groupby('location')['utc_offset'].transform('max')
        self._fillna(df, col_name='utc_offset', value=offset_by_location)
        # Anything still missing (unknown location, or no offset known for it) becomes 0.
        self._fillna(df, col_name='utc_offset', value=0)
        return df


### _Feature Engineering Class_

In [None]:
class FeatureEngineer:
    """Turn the imputed profile table into a numeric model matrix.

    The scaler is fitted when ``process_data`` is called with ``train=True`` and
    reused for later calls, so the training frame must be processed first.
    """
    scaler = None
    # Earliest profile creation year seen during the last train=True call.
    _base_year = None

    def __init__(self, scaler=None):
        """Use ``scaler`` (an object with fit/transform) or a StandardScaler by default."""
        self.scaler = scaler or preprocessing.StandardScaler()

    def process_data(self, df, train):
        """Add engineered features, scale numeric columns and drop raw columns.

        Args:
            df: imputed DataFrame with snake_cased column names; the binary flag
                columns are added to it in place before it is joined with the dummies.
            train: True to (re)fit the scaler and the creation-year baseline.

        Returns:
            A new DataFrame containing only model-ready columns (plus the target,
            when it is present in ``df``).
        """
        df['has_personal_url'] = df['personal_url'].notna().astype(int)
        df['cover_image_set'] = (df['profile_cover_image_status'] == 'Set').astype(int)
        df['view_size_customized'] = df['is_profile_view_size_customized?'].astype(int)
        df['is_location_visible'] = (df['location_public_visibility'].str.lower() == 'enabled').astype(int)

        df = df.join(self._encode_category(df, 'profile_verification_status', prefix='status'))
        df = df.join(self._encode_category(df, 'profile_category', prefix='category'))
        df = df.join(self._encode_category(df, 'user_language', prefix='lang'))

        creation_year = df['profile_creation_timestamp'].apply(lambda x: x.year)
        # Keep the training baseline so train and test years share the same origin;
        # fall back to the frame's own minimum when nothing has been fitted yet.
        if train or self._base_year is None:
            self._base_year = creation_year.min()
        df['creation_year'] = creation_year - self._base_year

        df = self._normalize_columns(df, col_names=[
            'utc_offset', 'num_of_followers', 'num_of_people_following',
            'num_of_status_updates', 'num_of_direct_messages',
            'avg_daily_profile_visit_duration_in_seconds', 'avg_daily_profile_clicks',
        ], train=train)

        self._drop_columns(df, col_names=[
            'id', 'user_name', 'personal_url', 'profile_cover_image_status',
            'is_profile_view_size_customized?', 'location_public_visibility',
            'profile_verification_status', 'profile_text_color', 'profile_page_color',
            'profile_theme_color',  'location', 'user_time_zone', 'profile_category',
            'profile_image', 'profile_creation_timestamp', 'user_language',
        ])
        return df

    def _normalize_columns(self, df, col_names, train):
        """Scale ``col_names`` with ``self.scaler`` (fitted first when ``train`` is True).

        Returns:
            ``df`` itself, with the listed columns replaced by their scaled values.
        """
        values = df[col_names].values
        if train:
            self.scaler.fit(values)
        # Positional assignment keeps every scaled row with its own record. Wrapping
        # the array in a fresh DataFrame and using df.update() aligned on a new
        # 0..n-1 index, which scrambles rows once dropna() has left index gaps.
        df[col_names] = self.scaler.transform(values)
        return df

    @staticmethod
    def _encode_category(df, col_name, target_encoding=False, prefix='profile'):
        """One-hot encode ``df[col_name]`` after normalising its labels.

        Labels are lower-cased and spaces become underscores; the normalised labels
        are written back to ``df[col_name]``. ``target_encoding`` is accepted for
        interface compatibility but is not used.

        Returns:
            DataFrame of dummy columns named ``<prefix>_<label>``.
        """
        # The .str accessor leaves missing values as NaN instead of raising.
        df[col_name] = df[col_name].str.replace(' ', '_', regex=False).str.lower()
        # TODO: change to sklearn encoders
        return pd.get_dummies(df[col_name], prefix=prefix)

    @staticmethod
    def _drop_columns(df, col_names):
        """Remove ``col_names`` from ``df`` in place."""
        df.drop(col_names, axis=1, inplace=True)


### _Data Processing Class_

In [None]:
class DataPreprocessor:
    """Run the three preprocessing stages: rename columns, impute, engineer features."""
    imputer = None
    feature_eng = None

    def __init__(self, imputer=None, feature_eng=None):
        """Use the supplied stages, or default ``Imputer`` / ``FeatureEngineer`` instances."""
        self.imputer = imputer if imputer else Imputer()
        self.feature_eng = feature_eng if feature_eng else FeatureEngineer()

    def preprocess(self, df, train=False):
        """Return the fully preprocessed version of ``df``.

        Args:
            df: raw DataFrame as read from the CSV file.
            train: forwarded to both stages; True for the training frame.
        """
        renamed = self._process_column_names(df)
        imputed = self._handle_missing_values(renamed, train=train)
        return self._engineer_features(imputed, train=train)

    @staticmethod
    def _process_column_names(df):
        """Convert column names to follow variables notation of python for convenience"""
        def _to_identifier(name):
            return name.lower().replace(' ', '_')

        return df.rename(columns=_to_identifier)

    def _handle_missing_values(self, df, train):
        """Delegate to the imputer stage."""
        filled = self.imputer.impute_data(df, train=train)
        return filled

    def _engineer_features(self, df, train):
        """Delegate to the feature-engineering stage."""
        engineered = self.feature_eng.process_data(df, train=train)
        return engineered


# <font color=''>**Data Preparation 1**</font>

In [None]:
if __name__ == '__main__':
    # Numeric columns that receive a power transform on top of the scaling done
    # inside the preprocessor.
    numeric_cols = [
        'utc_offset', 'num_of_followers', 'num_of_people_following', 'num_of_status_updates',
        'num_of_direct_messages', 'avg_daily_profile_visit_duration_in_seconds',
        'avg_daily_profile_clicks'
    ]

    df_train = pd.read_csv(train_csv_path, parse_dates=['Profile Creation Timestamp'])
    df_test = pd.read_csv(test_csv_path, parse_dates=['Profile Creation Timestamp'])
    # Keep the ids before preprocessing drops the column; they are needed for the submission.
    test_id = df_test['Id'].copy()

    preprocessor = DataPreprocessor()
    df_train = preprocessor.preprocess(df_train, train=True)

    ### ----- PARTS TO ADD INSIDE PREPROCESSOR --------
    pow_transform = PowerTransformer()
    df_train[numeric_cols] = pow_transform.fit_transform(df_train[numeric_cols])

    # The models are trained on log1p(likes); predictions are mapped back with expm1.
    df_train['num_of_profile_likes'] = np.log1p(df_train['num_of_profile_likes'])
    y_train = df_train['num_of_profile_likes']
    # drop() returns an independent frame, so adding the cluster columns below
    # does not write into a slice of df_train.
    X_train = df_train.drop(columns='num_of_profile_likes')

    df_test = preprocessor.preprocess(df_test, train=False)
    df_test[numeric_cols] = pow_transform.transform(df_test[numeric_cols])
    # Align the test matrix with the training columns: dummy columns that only
    # exist in train are added as 0, columns that only exist in test are dropped.
    # This replaces the hand-maintained list of language columns.
    df_test = df_test.reindex(columns=X_train.columns, fill_value=0)
    X_test = df_test

    # NOTE(review): allow_pickle=True executes pickled objects on load; only use
    # it with a cluster-features file you produced yourself.
    feat = np.load(cluster_csv_path, allow_pickle=True).tolist()
    # One-hot encode the pre-computed image cluster labels and attach them.
    # join() aligns on the row index, exactly like the column-by-column assignment did.
    res = pd.get_dummies(feat[6]['train_features'], prefix='cluster')
    X_train = X_train.join(res)
    res = pd.get_dummies(feat[6]['test_features'], prefix='cluster')
    X_test = X_test.join(res)

# <font color=''>**Model - 1 : Ensemble Method with clustering**</font>

- Submission #1 (best submitted private result on Kaggle)

## _Models Parameters Definition_

In [None]:
# Hyper-parameters of every base learner in the stacked ensemble, keyed by model alias.
params = dict(
    rf_reg=dict(
        n_estimators=700,
        max_depth=6,
        min_samples_split=5,
        min_samples_leaf=5,
        max_features=None,
        oob_score=True,
        random_state=42,
    ),
    svr=dict(
        C=20,
        epsilon=0.008,
        gamma=0.0003,
    ),
    bagging_reg=dict(
        n_estimators=500,
        max_samples=0.7,
        oob_score=True,
    ),
    gb_reg=dict(
        n_estimators=700,
        learning_rate=0.01,
        max_depth=7,
        max_features='sqrt',
        min_samples_leaf=5,
        min_samples_split=10,
        loss='huber',
        random_state=42,
    ),
    # NOTE(review): both `seed` and `random_state` are passed to XGBoost below;
    # confirm which of the two the installed version honours.
    xgb_reg=dict(
        learning_rate=0.01,
        n_estimators=1000,
        max_depth=5,
        gamma=0.6,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='reg:squarederror',
        nthread=-1,
        scale_pos_weight=1,
        seed=27,
        reg_alpha=0.00006,
        random_state=42,
    ),
)

### _Models definitions_

In [None]:
# Instantiate each base learner from its entry in `params`.
m_rf = RandomForestRegressor(**params['rf_reg'])
m_svr = SVR(**params['svr'])
# NOTE(review): params['bagging_reg'] carries no random_state, so this learner is not seeded.
m_bag = BaggingRegressor(**params['bagging_reg'])
m_gbr = GradientBoostingRegressor(**params['gb_reg'])
m_xgbr = XGBRegressor(**params['xgb_reg'])

### _Models Stacking_

In [None]:
# Stack the five base learners; no final_estimator is passed, so StackingRegressor
# falls back to its default meta-learner. n_jobs=-1 requests all available cores.
m_ensemble = StackingRegressor(
  estimators=[
    ('random_forest', m_rf),
    ('SVR', m_svr),
    ('bagging', m_bag),
    ('grad_boosting', m_gbr),
    ('xgboost', m_xgbr),
  ],
  n_jobs=-1)

### _Models Testing_

In [None]:
# Out-of-fold predictions of the stacked ensemble; y_train is in log1p space, so these are too.
cv_prediction = get_cv_predictions(m_ensemble, X_train, y_train, n_splits=5, shuffle=True)
# Make every prediction non-negative before inverting the log transform
# (note: abs() mirrors negative values rather than clipping them to 0).
cv_prediction = np.abs(cv_prediction)
# RMSLE on the original like-count scale.
print(rmsle(np.expm1(y_train), np.expm1(cv_prediction)))



1.7058835584148224


### _Stacking Model - Fit & Predict_

In [None]:
# Refit the ensemble on the full training set and predict the test set (log1p space).
m_ensemble.fit(X_train, y_train)
pred = m_ensemble.predict(X_test)
# Same non-negativity treatment as in the cross-validation cell.
pred = np.abs(pred)

### *Submission file - 1*

In [None]:
# Write the submission file: one (Id, Predicted) row per test profile.
# Predictions are in log1p space, so expm1 maps them back to like counts.
if len(test_id) != len(pred):
    raise ValueError('test_id and pred must have the same length')
# newline='' lets the csv module control line endings itself, as the csv
# documentation requires for files passed to csv.writer.
with open('curr_submission.csv', 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(['Id', 'Predicted'])
    csvwriter.writerows(zip(test_id, np.expm1(pred)))

# <font color=''>**Data Preparation 2**</font>

In [None]:
if __name__ == '__main__':
    # Numeric columns that receive a power transform on top of the scaling done
    # inside the preprocessor.
    numeric_cols = [
        'utc_offset', 'num_of_followers', 'num_of_people_following', 'num_of_status_updates',
        'num_of_direct_messages', 'avg_daily_profile_visit_duration_in_seconds',
        'avg_daily_profile_clicks'
    ]

    df_train = pd.read_csv(train_csv_path, parse_dates=['Profile Creation Timestamp'])
    df_test = pd.read_csv(test_csv_path, parse_dates=['Profile Creation Timestamp'])
    # Keep the ids before preprocessing drops the column; they are needed for the submission.
    test_id = df_test['Id'].copy()

    preprocessor = DataPreprocessor()
    df_train = preprocessor.preprocess(df_train, train=True)

    ### ----- PARTS TO ADD INSIDE PREPROCESSOR --------
    pow_transform = PowerTransformer()
    df_train[numeric_cols] = pow_transform.fit_transform(df_train[numeric_cols])

    # The models are trained on log1p(likes); predictions are mapped back with expm1.
    df_train['num_of_profile_likes'] = np.log1p(df_train['num_of_profile_likes'])
    y_train = df_train['num_of_profile_likes']
    X_train = df_train.drop(columns='num_of_profile_likes')

    df_test = preprocessor.preprocess(df_test, train=False)
    df_test[numeric_cols] = pow_transform.transform(df_test[numeric_cols])
    # Align the test matrix with the training columns: dummy columns that only
    # exist in train are added as 0, columns that only exist in test are dropped.
    # This replaces the hand-maintained list of language columns.
    df_test = df_test.reindex(columns=X_train.columns, fill_value=0)
    X_test = df_test



# **Model - 2 : Ensemble Method only**


- Submission #2 (2nd best submitted private result on Kaggle)

### _Models Parameters Definition_

In [None]:
# Hyper-parameters of every base learner in the stacked ensemble, one dict per model.
_rf_params = {
    'n_estimators': 700,
    'max_depth': 6,
    'min_samples_split': 5,
    'min_samples_leaf': 5,
    'max_features': None,
    'oob_score': True,
    'random_state': 42,
}
_svr_params = {'C': 20, 'epsilon': 0.008, 'gamma': 0.0003}
_bagging_params = {'n_estimators': 500, 'max_samples': 0.7, 'oob_score': True}
_gb_params = {
    'n_estimators': 700,
    'learning_rate': 0.01,
    'max_depth': 7,
    'max_features': 'sqrt',
    'min_samples_leaf': 5,
    'min_samples_split': 10,
    'loss': 'huber',
    'random_state': 42,
}
# NOTE(review): both 'seed' and 'random_state' are passed to XGBoost;
# confirm which of the two the installed version honours.
_xgb_params = {
    'learning_rate': 0.01,
    'n_estimators': 1000,
    'max_depth': 5,
    'gamma': 0.6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'reg:squarederror',
    'nthread': -1,
    'scale_pos_weight': 1,
    'seed': 27,
    'reg_alpha': 0.00006,
    'random_state': 42,
}

# Lookup table used by the model-definition cell.
params = {
    'rf_reg': _rf_params,
    'svr': _svr_params,
    'bagging_reg': _bagging_params,
    'gb_reg': _gb_params,
    'xgb_reg': _xgb_params,
}

### _Models definitions_

In [None]:
# Instantiate each base learner from its entry in `params`.
m_rf = RandomForestRegressor(**params['rf_reg'])
m_svr = SVR(**params['svr'])
# NOTE(review): params['bagging_reg'] carries no random_state, so this learner is not seeded.
m_bag = BaggingRegressor(**params['bagging_reg'])
m_gbr = GradientBoostingRegressor(**params['gb_reg'])
m_xgbr = XGBRegressor(**params['xgb_reg'])

### _Models Stacking_

In [None]:
# Stack the five base learners; no final_estimator is passed, so StackingRegressor
# falls back to its default meta-learner. n_jobs=-1 requests all available cores.
m_ensemble = StackingRegressor(
    estimators=[
    ('random_forest', m_rf),
    ('SVR', m_svr),
    ('bagging', m_bag),
    ('grad_boosting', m_gbr),
    ('xgboost', m_xgbr),
  ],
  n_jobs=-1)

### _Models Testing_

In [None]:
# Out-of-fold predictions of the stacked ensemble; y_train is in log1p space, so these are too.
cv_prediction = get_cv_predictions(m_ensemble, X_train, y_train, n_splits=5, shuffle=True)
# Make every prediction non-negative before inverting the log transform
# (note: abs() mirrors negative values rather than clipping them to 0).
cv_prediction = np.abs(cv_prediction)
# RMSLE on the original like-count scale.
print(rmsle(np.expm1(y_train), np.expm1(cv_prediction)))



1.7059227078154642


### _Stacking Model - Fit & Predict_

In [None]:
# Refit the ensemble on the full training set and predict the test set (log1p space).
m_ensemble.fit(X_train, y_train)
pred = m_ensemble.predict(X_test)
# Same non-negativity treatment as in the cross-validation cell.
pred = np.abs(pred)

## *Submission file 2*

In [None]:
# Write the submission file: one (Id, Predicted) row per test profile.
# Predictions are in log1p space, so expm1 maps them back to like counts.
if len(test_id) != len(pred):
    raise ValueError('test_id and pred must have the same length')
# newline='' lets the csv module control line endings itself, as the csv
# documentation requires for files passed to csv.writer.
with open('curr_submission2.csv', 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(['Id', 'Predicted'])
    csvwriter.writerows(zip(test_id, np.expm1(pred)))


# **Model - 3**


- Submission #3 (3rd best submitted private result on Kaggle)

In [None]:
import csv
import math
import pdb

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, 
    VotingRegressor, StackingRegressor, BaggingRegressor,
)
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import make_scorer
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR, LinearSVR
from xgboost import XGBRegressor
import seaborn as sns
from  matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn import linear_model
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from scipy.special import boxcox1p, inv_boxcox1p
from scipy.stats import boxcox_normmax
from scipy.stats import skew, norm
from scipy.stats import yeojohnson
from sklearn.preprocessing import PowerTransformer
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Numeric feature columns that are power-transformed in the Model 3 data preparation.
numeric_cols = [
    'utc_offset', 'num_of_followers', 'num_of_people_following', 'num_of_status_updates', 
    'num_of_direct_messages', 'avg_daily_profile_visit_duration_in_seconds', 
    'avg_daily_profile_clicks'
]


In [None]:
class Imputer:
    """Fill missing values in the profile table (column names already snake_cased)."""

    @staticmethod
    def _fillna(df, col_name, value, na_symbol=None):
        """Replace missing entries of ``df[col_name]`` with ``value``, modifying ``df``.

        Args:
            df: DataFrame to modify.
            col_name: name of the column to impute.
            value: scalar, or a Series aligned on ``df.index``, used as the fill value.
            na_symbol: optional placeholder (e.g. ``'??'``) that also counts as missing.
        """
        column = df[col_name]
        if na_symbol is not None:
            column = column.replace(na_symbol, np.nan)
        # Assign the result back to the frame: chained ``df[col].fillna(inplace=True)``
        # operates on a temporary under pandas copy-on-write and may leave df untouched.
        df[col_name] = column.fillna(value)

    def impute_data(self, df, train):
        """Impute every column the feature engineering step relies on.

        Args:
            df: DataFrame with snake_cased column names; it is modified in place.
            train: when True, rows without a ``num_of_profile_likes`` target are dropped.

        Returns:
            The same DataFrame object, after imputation.
        """
        if train:
            df.dropna(subset=['num_of_profile_likes'], axis=0, inplace=True)
        self._fillna(df, col_name='profile_cover_image_status', value='Not set')
        self._fillna(df, col_name='is_profile_view_size_customized?', value=False)
        self._fillna(df, col_name='location_public_visibility', value='disabled', na_symbol='??')
        self._fillna(df, col_name='profile_category', value='unknown', na_symbol=' ')
        # NOTE(review): medians are taken from whichever frame is passed in, so the
        # test set is imputed with its own medians rather than the training ones.
        self._fillna(df, col_name='avg_daily_profile_visit_duration_in_seconds', value=df['avg_daily_profile_visit_duration_in_seconds'].median())
        self._fillna(df, col_name='avg_daily_profile_clicks', value=df['avg_daily_profile_clicks'].median())

        # A row without utc_offset borrows the largest offset seen for the same
        # location. Grouping must be on location alone: including utc_offset in the
        # group keys excludes exactly the rows that need a value, making the fill a no-op.
        offset_by_location = df.groupby('location')['utc_offset'].transform('max')
        self._fillna(df, col_name='utc_offset', value=offset_by_location)
        # Anything still missing (unknown location, or no offset known for it) becomes 0.
        self._fillna(df, col_name='utc_offset', value=0)
        return df

In [None]:
class FeatureEngineer:
    """Turn the imputed profile table (including the image ``cluster`` label) into a model matrix.

    The scaler is fitted when ``process_data`` is called with ``train=True`` and
    reused for later calls, so the training frame must be processed first.
    """
    scaler = None
    # Earliest profile creation year seen during the last train=True call.
    _base_year = None

    def __init__(self, scaler=None):
        """Use ``scaler`` (an object with fit/transform) or a StandardScaler by default."""
        self.scaler = scaler or preprocessing.StandardScaler()

    def process_data(self, df, train):
        """Add engineered features, scale numeric columns and drop raw columns.

        Args:
            df: imputed DataFrame with snake_cased column names and a string
                ``cluster`` column; the binary flag columns are added to it in place.
            train: True to (re)fit the scaler and the creation-year baseline.

        Returns:
            A new DataFrame containing only model-ready columns (plus the target,
            when it is present in ``df``).
        """
        df['has_personal_url'] = df['personal_url'].notna().astype(int)
        df['cover_image_set'] = (df['profile_cover_image_status'] == 'Set').astype(int)
        df['view_size_customized'] = df['is_profile_view_size_customized?'].astype(int)
        df['is_location_visible'] = (df['location_public_visibility'].str.lower() == 'enabled').astype(int)

        df = df.join(self._encode_category(df, 'profile_verification_status', prefix='status'))
        df = df.join(self._encode_category(df, 'profile_category', prefix='category'))
        df = df.join(self._encode_category(df, 'user_language', prefix='lang'))
        df = df.join(self._encode_category(df, 'cluster', prefix='cluster'))

        creation_year = df['profile_creation_timestamp'].apply(lambda x: x.year)
        # Keep the training baseline so train and test years share the same origin;
        # fall back to the frame's own minimum when nothing has been fitted yet.
        if train or self._base_year is None:
            self._base_year = creation_year.min()
        df['creation_year'] = creation_year - self._base_year

        df = self._normalize_columns(df, col_names=[
            'utc_offset', 'num_of_followers', 'num_of_people_following',
            'num_of_status_updates', 'num_of_direct_messages',
            'avg_daily_profile_visit_duration_in_seconds', 'avg_daily_profile_clicks',
        ], train=train)

        self._drop_columns(df, col_names=[
            'id', 'user_name', 'personal_url', 'profile_cover_image_status',
            'is_profile_view_size_customized?', 'location_public_visibility',
            'profile_verification_status', 'profile_text_color', 'profile_page_color',
            'profile_theme_color',  'location', 'user_time_zone', 'profile_category',
            'profile_image', 'profile_creation_timestamp', 'user_language', 'cluster',
        ])
        return df

    @staticmethod
    def _drop_columns(df, col_names):
        """Remove ``col_names`` from ``df`` in place."""
        df.drop(col_names, axis=1, inplace=True)

    def _normalize_columns(self, df, col_names, train):
        """Scale ``col_names`` with ``self.scaler`` (fitted first when ``train`` is True).

        Returns:
            ``df`` itself, with the listed columns replaced by their scaled values.
        """
        values = df[col_names].values
        if train:
            self.scaler.fit(values)
        # Positional assignment keeps every scaled row with its own record. Wrapping
        # the array in a fresh DataFrame and using df.update() aligned on a new
        # 0..n-1 index, which scrambles rows once dropna() has left index gaps.
        df[col_names] = self.scaler.transform(values)
        return df

    @staticmethod
    def _encode_category(df, col_name, target_encoding=False, prefix='profile'):
        """One-hot encode ``df[col_name]`` after normalising its labels.

        Labels are lower-cased and spaces become underscores; the normalised labels
        are written back to ``df[col_name]``. ``target_encoding`` is accepted for
        interface compatibility but is not used.

        Returns:
            DataFrame of dummy columns named ``<prefix>_<label>``.
        """
        # The .str accessor leaves missing values as NaN instead of raising.
        df[col_name] = df[col_name].str.replace(' ', '_', regex=False).str.lower()
        return pd.get_dummies(df[col_name], prefix=prefix)

In [None]:
class DataPreprocessor:
    """Run the three preprocessing stages: rename columns, impute, engineer features."""
    imputer = None
    feature_eng = None

    def __init__(self, imputer=None, feature_eng=None):
        """Use the supplied stages, or default ``Imputer`` / ``FeatureEngineer`` instances."""
        self.imputer = imputer if imputer else Imputer()
        self.feature_eng = feature_eng if feature_eng else FeatureEngineer()

    def preprocess(self, df, train=False):
        """Return the fully preprocessed version of ``df``.

        Args:
            df: raw DataFrame as read from the CSV file.
            train: forwarded to both stages; True for the training frame.
        """
        renamed = self._process_column_names(df)
        imputed = self._handle_missing_values(renamed, train=train)
        return self._engineer_features(imputed, train=train)

    @staticmethod
    def _process_column_names(df):
        """Lower-case every column name and replace its spaces with underscores."""
        def _to_identifier(name):
            return name.lower().replace(' ', '_')

        return df.rename(columns=_to_identifier)

    def _handle_missing_values(self, df, train):
        """Delegate to the imputer stage."""
        filled = self.imputer.impute_data(df, train=train)
        return filled

    def _engineer_features(self, df, train):
        """Delegate to the feature-engineering stage."""
        engineered = self.feature_eng.process_data(df, train=train)
        return engineered

### Image feature extraction

In [None]:

import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
from PIL import Image
import pandas as pd
import numpy as np
import glob, os, time, copy
import torch.optim as optim
from torchvision import datasets, transforms
import os
from PIL import Image
import glob
import numpy as np
from torch.utils.data import Dataset
from torchvision import transforms
import librosa
from random import shuffle
from skimage.io import imread
from sklearn.cluster import KMeans

# Available per-channel normalisations (three channels each), selected by name
# through the `normalize_fun` argument of Read_dataset.
norm_dict = {
'normalize_torch' : transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225]
),
'normalize_05' : transforms.Normalize(
    mean=[0.5, 0.5, 0.5],
    std=[0.5, 0.5, 0.5]
)
}

class Read_dataset(Dataset):
    """Dataset that loads image files and returns them as normalised tensors.

    Args:
        img_list: sequence of image file paths.
        img_size: images are resized to ``img_size`` x ``img_size``.
        normalize_fun: key of ``norm_dict`` selecting the normalisation to apply.
    """

    def __init__(self, img_list, img_size=224, normalize_fun='normalize_torch'):
        self.data = img_list
        self.transform = transforms.Compose([
            transforms.Resize((img_size,img_size), interpolation=2),
            transforms.ToTensor(),
            norm_dict[normalize_fun]
        ])

    def __len__(self):
        """Number of image paths in the dataset."""
        return len(self.data)

    def __getitem__(self, i):
        """Return ``(image_tensor, path)`` for the ``i``-th image."""
        path = self.data[i]
        # Force three channels: a grayscale or RGBA file would otherwise yield a
        # 1- or 4-channel tensor that the 3-channel Normalize cannot process.
        image = Image.fromarray(imread(path)).convert('RGB')
        return self.transform(image), path

class feature_model(nn.Module):
    """Pretrained ResNet-50 with its last child module removed, used to produce image features."""
    def __init__(self):
        super().__init__()
        self.model = models.resnet50(pretrained=True)
        # Keep every child module of the ResNet except the last one.
        self.feature = nn.Sequential(*list(self.model.children())[:-1])
    # Set your own forward pass
    def forward(self, x):
        # Reshape the truncated network's output into rows of 2048 values.
        x = self.feature(x).view(-1, 2048)
        return x # feature tensor with 2048 columns


# Sort the globbed paths so the extraction order (and therefore the KMeans fit
# below) is the same on every run.
img_list = sorted(glob.glob('/content/train_profile_images/profile_images_train/*.png'))
test_img = sorted(glob.glob('/content/test_profile_images/profile_images_test/*.png'))

# No shuffling: features are keyed by file name, so batch order is irrelevant,
# and a fixed order keeps the rows of `val` reproducible.
train_dataset = Read_dataset(img_list)
training_data_loader = DataLoader(dataset=train_dataset, num_workers=8,
                                  batch_size=16,
                                  shuffle=False)
test_dataset = Read_dataset(test_img)
test_data_loader = DataLoader(dataset=test_dataset, num_workers=8,
                              batch_size=16,
                              shuffle=False)

# d maps image file name -> 2048-d feature vector (train and test images together).
d = {}
# One row per image; sized from the file lists so no all-zero padding rows reach KMeans.
val = np.zeros((len(img_list), 2048))
test_val = np.zeros((len(test_img), 2048))

# Use the GPU when there is one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
x = feature_model().to(device)
# Inference mode: batch-norm layers use their stored statistics, so an image's
# features do not depend on the other images in its batch.
x.eval()


def _extract_features(loader, model, feature_store, matrix):
    """Run ``model`` over ``loader``, filling ``feature_store`` and ``matrix``.

    Args:
        loader: yields ``(image_batch, paths)`` pairs.
        model: feature extractor returning one row per image.
        feature_store: dict updated with ``file name -> feature vector``.
        matrix: pre-allocated array receiving one feature row per image, in loader order.

    Returns:
        Number of images processed.
    """
    row = 0
    with torch.no_grad():  # gradients are not needed for feature extraction
        for images, paths in loader:
            out = model(images.to(device)).cpu().numpy()
            for path, vector in zip(paths, out):
                key = path.split('/')[-1]
                if key in feature_store:
                    # Two images share a file name; the later one overwrites the earlier.
                    print('error')
                feature_store[key] = vector
                matrix[row] = vector
                row += 1
    return row


count = _extract_features(training_data_loader, x, d, val)
count = _extract_features(test_data_loader, x, d, test_val)
print('+ done ')

# Cluster the training-image features; test images are assigned later with kmeans.predict.
n = 6
kmeans = KMeans(n_clusters=n, random_state=0).fit(val)
label = kmeans.labels_

In [None]:
def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error: sqrt of ``mean_squared_log_error(y_true, y_pred)``."""
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)


def clip_columns(df):
    """Clip four count columns of ``df`` to their own 25th-95th percentile range.

    The columns are overwritten in ``df`` itself, and ``df`` is returned.
    """
    cols = ['Num of Followers', 'Num of People Following', 'Num of Status Updates', 'Num of Direct Messages']
    # Rows of `bounds` are the 0.25 and 0.95 quantiles, columns are the clipped fields.
    bounds = df[cols].quantile([0.25, 0.95])
    for name in cols:
        lower, upper = bounds[name]
        df[name] = df[name].clip(lower, upper)
    return df

def img_cluster(kmeans_model, feature_dict, img_key):
    """Return the cluster that ``kmeans_model`` predicts for one image.

    Args:
        kmeans_model: object exposing ``predict``.
        feature_dict: mapping from image key to its feature vector.
        img_key: key of the image to look up in ``feature_dict``.
    """
    features = feature_dict[img_key]
    # predict() is given a batch, so add a leading axis of length 1.
    batch = np.asanyarray(features)[np.newaxis, ...]
    return kmeans_model.predict(batch)[0]

def cluster_update(kmeans_model, feature_dict, df):
    """Return the cluster label, as a string, for every row's ``'Profile Image'`` value.

    Args:
        kmeans_model: model handed to ``img_cluster``.
        feature_dict: mapping from image key to feature vector.
        df: DataFrame with a ``'Profile Image'`` column holding the image keys.

    Returns:
        List of string labels, in row order.
    """
    return [
        str(img_cluster(kmeans_model, feature_dict, image_key))
        for image_key in df['Profile Image'].values
    ]



if __name__ == '__main__':
    df_train = pd.read_csv('./train.csv', parse_dates=['Profile Creation Timestamp'])
    df_test = pd.read_csv('./test.csv', parse_dates=['Profile Creation Timestamp'])

    # Attach the image cluster label (as a string) of every profile picture.
    df_train['cluster'] = cluster_update(kmeans, d, df_train)
    df_test['cluster'] = cluster_update(kmeans, d, df_test)

    # Keep the ids before preprocessing drops the column.
    test_id = df_test['Id'].copy()

    # df_train = clip_columns(df_train)
    # df_test = clip_columns(df_test)

    preprocessor = DataPreprocessor()
    df_train = preprocessor.preprocess(df_train, train=True)

    pow_transform = PowerTransformer()
    df_train[numeric_cols] = pow_transform.fit_transform(df_train[numeric_cols])

    # The models are trained on log1p(likes).
    df_train['num_of_profile_likes'] = np.log1p(df_train['num_of_profile_likes'])
    y = df_train['num_of_profile_likes']
    X = df_train.drop(columns='num_of_profile_likes')

    df_test = preprocessor.preprocess(df_test, train=False)
    df_test[numeric_cols] = pow_transform.transform(df_test[numeric_cols])
    # Align the test matrix with the training columns: dummy columns that only
    # exist in train are added as 0, columns that only exist in test are dropped.
    # This replaces the hand-maintained list of language columns.
    df_test = df_test.reindex(columns=X.columns, fill_value=0)
    test_data = df_test
    


### Model Selection

In [None]:
from sklearn.metrics import mean_squared_log_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
import xgboost as xgb
import lightgbm as lgb
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold

def run_gradient_boosting(clf, fit_params, train, target, test):
  """Cross-validate ``clf`` over 5 stratified folds and average its test predictions.

  Args:
    clf: regressor whose ``fit`` accepts ``eval_set`` plus ``**fit_params`` and which
      exposes ``predict`` and ``feature_importances_``. The same instance is refit
      on every fold.
    fit_params: extra keyword arguments forwarded to ``clf.fit``.
    train: training feature DataFrame.
    target: training target Series (positionally aligned with ``train``).
    test: test feature DataFrame.

  Returns:
    ``(oofs, preds, fi)``: out-of-fold predictions for ``train``, the mean of the
    per-fold predictions for ``test``, and the 20 highest mean feature importances
    (ordered for the horizontal bar plot that is drawn as a side effect).
  """
  N_SPLITS = 5
  oofs = np.zeros(len(train))
  preds = np.zeros((len(test)))

  # target = train[TARGET_COL]
  features = list(train.columns.values)
  folds = StratifiedKFold(n_splits = N_SPLITS)
  # The target is continuous, so it is binned into (up to) 10 quantile buckets
  # and the folds are stratified on the bucket index.
  stratified_target = pd.qcut(target, 10, labels = False, duplicates='drop')

  feature_importances = pd.DataFrame()

  for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, stratified_target)):
    print(f'\n------------- Fold {fold_ + 1} -------------')

    ### Training Set
    X_trn, y_trn = train.iloc[trn_idx], target.iloc[trn_idx]

    ### Validation Set
    X_val, y_val = train.iloc[val_idx], target.iloc[val_idx]

    # The validation fold is also passed as eval_set, for use by fit_params
    # such as early stopping.
    _ = clf.fit(X_trn, y_trn, eval_set = [(X_val, y_val)], **fit_params)
    fold_importance = pd.DataFrame({'fold': fold_ + 1, 'feature': features, 'importance': clf.feature_importances_})
    feature_importances = pd.concat([feature_importances, fold_importance], axis=0)

    ### Predict the validation fold and the test set with this fold's model.
    preds_val = clf.predict(X_val)
    # pred_val[pred_val<0] = 0
    preds_test = clf.predict(test)

    fold_score = av_metric(y_val, preds_val)
    print(f'\nAV metric score for validation set is {fold_score}')

    oofs[val_idx] = preds_val
    # Accumulate the running mean of the test predictions across folds.
    preds += preds_test / N_SPLITS


  oofs_score = av_metric(target, oofs)
  print(f'\n\nAV metric for oofs is {oofs_score}')

  feature_importances = feature_importances.reset_index(drop = True)
  # Top 20 features by mean importance, reversed so the largest bar is drawn at the top.
  fi = feature_importances.groupby('feature')['importance'].mean().sort_values(ascending = False)[:20][::-1]
  fi.plot(kind = 'barh', figsize=(12, 6))

  return oofs, preds , fi

def av_metric(y_true, y_pred):
  """RMSLE on the original like-count scale for log1p-transformed values.

  Args:
    y_true: targets in log1p space (as produced by ``np.log1p`` on the like counts).
    y_pred: predictions in the same log1p space.

  Returns:
    Root mean squared logarithmic error between the back-transformed values.
  """
  # The targets were built with log1p, so expm1 (not exp) is the matching inverse;
  # this also agrees with how the submission converts predictions back.
  likes_true = np.expm1(y_true)
  # A negative log-space prediction would map below zero, which
  # mean_squared_log_error rejects, so floor predictions at 0.
  likes_pred = np.clip(np.expm1(y_pred), 0, None)
  return np.sqrt(mean_squared_log_error(likes_true, likes_pred))


### _XGBoostRegressor_

In [None]:
# XGBoost regressor evaluated and used for test predictions via run_gradient_boosting.
model = XGBRegressor(n_estimators = 1000,
                    max_depth = 6,
                    learning_rate = 0.05,
                    colsample_bytree = 0.5,
                    random_state=1452,
                    )

# Forwarded as keyword arguments to model.fit inside run_gradient_boosting.
fit_params = {'verbose': 200, 'early_stopping_rounds': 200}

xgb_oofs, pred_xgb, fi = run_gradient_boosting(model, fit_params, X, y, test_data)



### _LGBMRegressor_

In [None]:
# LightGBM regressor evaluated and used for test predictions via run_gradient_boosting.
model = LGBMRegressor(n_estimators = 5000,
                        learning_rate = 0.01,
                        colsample_bytree = 0.76,
                        metric = 'None',
                        )
# Forwarded as keyword arguments to model.fit inside run_gradient_boosting.
fit_params = {'verbose': 300, 'early_stopping_rounds': 200, 'eval_metric': 'rmse'}
lgb_oofs, pred_lgb, fi = run_gradient_boosting(model, fit_params, X, y, test_data)

### _CatBoostRegressor_

In [None]:
# CatBoost regressor evaluated and used for test predictions via run_gradient_boosting.
model = CatBoostRegressor(n_estimators = 3000,
                       learning_rate = 0.01,
                       rsm = 0.4, ## Analogous to colsample_bytree
                       random_state=2054,
                       )

# Forwarded as keyword arguments to model.fit inside run_gradient_boosting.
fit_params = {'verbose': 200, 'early_stopping_rounds': 200}

cb_oofs,pred_cb, fi = run_gradient_boosting(model, fit_params, X, y, test_data)

### _Submission - 3_

In [None]:
# Map each model's predictions back from log space (exp(p) - 1), average the
# three models, and round to whole like counts.
v = np.round((np.exp(pred_xgb)-1 + np.exp(pred_lgb)-1 + np.exp(pred_cb)-1)/3).astype('int')
# NOTE(review): this assumes sample_submission.csv lists the ids in the same
# order as test.csv — confirm.
x = pd.read_csv('./sample_submission.csv')
x['Predicted'] = v
x.to_csv('ensemble_try.csv', index=False)

# **Annex A - Best private result on Kaggle - 1.60690 (unsubmitted)**

Readme:


1.   Run all the cells from "Environment & Global Variables Config", "Preprocessing Functions" & "Model Function" to define functions.
2.  To start processing data, run the "Main" cells at the bottom.
3.  The preprocessing output variables are called prep_df_train & prep_df_test. Those variables are the inputs to the model.
4.  The submission will be generated by running the last cell of "Main code". The file will appear in the Colab left panel and can be downloaded.



## Environment & Global Variables Config

### Package Installation & Import

In [None]:
!pip install catboost
!pip install lightgbm
!pip install langcodes
import gdown
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import langcodes
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import PowerTransformer
import cv2
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.cluster import SpectralClustering
from sklearn.model_selection import (cross_val_score,StratifiedKFold)
from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_selection import SelectKBest, mutual_info_regression  

from sklearn.metrics import mean_squared_log_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import KFold

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, 
    VotingRegressor, StackingRegressor, BaggingRegressor,
)
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import make_scorer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR, LinearSVR
from xgboost import XGBRegressor
from  matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn import linear_model
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from scipy.special import boxcox1p, inv_boxcox1p
from scipy.stats import boxcox_normmax
from scipy.stats import skew, norm
from scipy.stats import yeojohnson
from sklearn.preprocessing import PowerTransformer
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import RidgeCV


import os
import tensorflow as tf
import random
#from google.colab import drive


%matplotlib inline

# Set random seeds (for reproducibility requirement)
os.environ['PYTHONHASHSEED']=str(1)
tf.random.set_seed(1)
np.random.seed(1)
random.seed(1)
#TO INCLUDE OTHER LIBRARIES AS WELL

#drive.mount('/content/drive/')

### Datasets download

In [None]:
#gdown.download("https://drive.google.com/uc?id=1hE7ZuRTD8uBKLmls0NUrMCJEr4iXJDMn","train.csv",quiet=True);  #importing files from local Google Drive.
#gdown.download("https://drive.google.com/uc?id=105G0nu5i7It_ZUZgnF63ARYqABHNU4cB","test.csv",quiet=True);
gdown.download("https://drive.google.com/uc?id=1jQf-0Xw0wNymVGeAe_NmmFYOf5lJm5Ty","dataset.zip",quiet=True);
!unzip -qq dataset.zip

#!cp -R /content/drive/MyDrive/Python/IFT6758/ift6758-a20.zip .   #Autre méthode

### Global variables

In [None]:
def get_images(img_serie, path):
  """Load every image named in ``img_serie`` from the folder ``path``.

  Args:
    img_serie: iterable of image file names (e.g. the 'Profile Image' column).
    path: prefix prepended verbatim to each file name, so it has to end with a
      path separator (e.g. "./train_profile_images/profile_images_train/").

  Returns:
    A list with one ``cv2.imread`` result per file name, in input order.
    Per the OpenCV documentation, colour images are decoded in B, G, R order.

  Raises:
    FileNotFoundError: if at least one file could not be read.
  """
  img_df = []
  unreadable = []
  for img_name in img_serie:
    img_path = path + img_name
    img = cv2.imread(img_path)
    # cv2.imread does not raise on a missing/corrupt file, it returns None;
    # collect those here so the failure is reported with the offending paths
    # instead of surfacing later as an obscure reshape/iteration error.
    if img is None:
      unreadable.append(img_path)
    img_df.append(img)
  if unreadable:
    raise FileNotFoundError(
        str(len(unreadable)) + ' image(s) could not be read, first ones: ' + ', '.join(unreadable[:5]))
  return img_df

In [None]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv") 

img_train = get_images(df_train['Profile Image'],"./train_profile_images/profile_images_train/")
img_test = get_images(df_test['Profile Image'],"./test_profile_images/profile_images_test/")

## Data Exploration

### Training Dataset Exploration

In [None]:
df_train.info()

In [None]:
df_train.head(5)

In [None]:
sns.boxplot(x='Profile Category',y='Num of Profile Likes',hue='Is Profile View Size Customized?',data=df_train);

In [None]:
sns.boxplot(y='Num of Profile Likes',data=df_train);

In [None]:
#viewing Likes distribution along with URL (identified as important feature)
g = sns.kdeplot(df_train['Num of Profile Likes'][df_train['Personal URL'].str.len() > 12], label="URL Defined", shade=True, color="red")
g = sns.kdeplot(df_train['Num of Profile Likes'][~(df_train['Personal URL'].str.len() > 12)], label="URL Not Defined", shade=True, color="green")
plt.xlabel("Likes")
plt.ylabel("Density")
plt.legend()
plt.show()
#Distribution is very large -> log scale for Likes?

In [None]:
g = sns.kdeplot(np.log(1+df_train['Num of Profile Likes'][df_train['Personal URL'].str.len() > 12]), label="URL Defined", shade=True, color="red")
g = sns.kdeplot(np.log(1+df_train['Num of Profile Likes'][~(df_train['Personal URL'].str.len() > 12)]), label="URL Not Defined", shade=True, color="green")
plt.xlabel("Likes")
plt.ylabel("Density")
plt.legend()
plt.show()
#Much better in log scale. We can as well confirm the URL presence is an important feature.

In [None]:
list(img_train[0][0][0]) == [1,1,1]

In [None]:
df_train_img.head(1)

### Test Dataset Exploration

In [None]:
#Check missing values & other coner cases that would have been missed with Training Dataset.

df_test.info()

### Feature Engineering

In [None]:
#User Name feature
temp_fea = df_train['User Name'].apply(lambda x:len(x))
temp_fea.head(5)

In [None]:
#Personnal URL
temp_fea = df_train['Personal URL'].str.len() > 12      #to consider URL shorter than 12 chars as irrelevant.
temp_fea.head(5)

In [None]:
#Profile Cover Image Status -> missing values = Unknown category
temp_fea = df_train['Profile Cover Image Status'].copy()
temp_fea[(temp_fea != 'Set') & (temp_fea != 'Not set')] = 'Unknown'

print(temp_fea.head(5))
temp_fea[temp_fea == 'Unknown']

In [None]:
#Profile Verification Status -> to be One-Hot Encoded
temp_fea = df_train['Profile Verification Status'].copy()
temp_fea.head(5)

In [None]:
#temp_fea = df_train[['Profile Text Color','Profile Page Color','Profile Theme Color']].apply(lambda x:['000000' if pd.isnull(x) & (len(str(x)) > 6) else x,axis=1,result_type='broadcast')
temp_fea = pd.DataFrame()
temp = df_train[['Profile Text Color','Profile Page Color','Profile Theme Color']].applymap(lambda x:'000000' if (pd.isnull(x) | (len(str(x)) != 6)) else x)
reds,greens,blues = ['Text Red','Page Red','Theme Red'],['Text Green','Page Green','Theme Green'],['Text Blue','Page Blue','Theme Blue']
temp_fea[reds],temp_fea[greens],temp_fea[blues] = temp.applymap(lambda x:int(x[0:2],16)),temp.applymap(lambda x:int(x[2:4],16)),temp.applymap(lambda x:int(x[4:6],16))
temp_fea

In [None]:
#Is Profile View Size Customized? already in boolean, which is good
temp_fea = df_train['Is Profile View Size Customized?']
temp_fea.head(5)

In [None]:
#UTC Offset, Location & User Time Zone -> drop location, map UTC Offset and Time zone to complete UTC Offset missing data. Then convert UTC Offset to global regions.
tz_mapping = df_train[['UTC Offset','User Time Zone']].pivot_table(index=['User Time Zone','UTC Offset'],values='UTC Offset',aggfunc='count').reset_index(drop=False)
utc_labels = ['America','Europe & Africa','Middle East','East Asia']
temp_fea = df_train['User Time Zone'].apply(lambda x:utc_labels[np.digitize(tz_mapping[tz_mapping['User Time Zone']==x]['UTC Offset'].values[0],[-5400,9000,23400])] if not pd.isnull(x) else 'Unknown')
temp_fea


In [None]:
#Location Public Visibility & User Language -> lowercase & empty values management
temp_fea = df_train[['Location Public Visibility','User Language']].applymap(lambda x:x.lower() if not pd.isnull(x) else 'none')
temp_fea
lb = LabelBinarizer()
temp = lb.fit_transform(temp_fea['Location Public Visibility'])
lb.classes_
temp_fea = pd.concat([temp_fea,pd.DataFrame(temp,columns=lb.classes_)],axis=1).drop(['Location Public Visibility'], axis=1)
temp_fea

In [None]:
#Profile Creation Timestamp
temp_fea = df_train['Profile Creation Timestamp'].apply(lambda x:int(x[-4:]))
temp_fea.unique()

In [None]:
#Num of Followers, Num of People Following, Num of Status Updates, Num of Direct Messages, Avg Daily Profile Visit Duration in seconds & Avg Daily Profile Clicks
temp_fea = df_train['Num of Followers']
temp = temp_fea.fillna(value=temp_fea.median())

temp = StandardScaler().fit_transform(np.array(temp.copy()).reshape(-1,1))
temp

In [None]:
#Profile Category
df_train['Profile Category'].unique()
temp_fea = df_train['Profile Category'].apply(lambda x:x.lower() if (not pd.isnull(x) | (x==' ')) else 'unknown')
temp_fea.unique()

In [None]:
#Profile Likes -> change to Log scale
temp_fea = np.log(1+df_train['Num of Profile Likes'])
plt.hist(temp_fea,bins=50);

In [None]:
sns.boxplot(temp_fea)
#Even if outliers, we might consider not dropping zeros, as they might contain useful information for model.


In [None]:
Q1 = temp_fea.quantile(0.25)
Q3  = temp_fea.quantile(0.75)
IQR = Q3 - Q1
#print(Q3 +1.5*IQR)
#print(Q1 - 1.5*IQR)
print(temp_fea[(temp_fea > Q3 +1.5*IQR)])
#We could drop upper outlier only for Log Likes.

In [None]:
#Zero number of Likes: To drop or not to drop.
print(len(df_train['Personal URL'][(temp_fea == 0) & (df_train['Personal URL'].str.len() > 12)]))
print(len(df_train['Personal URL'][(temp_fea == 0) & ~(df_train['Personal URL'].str.len() > 12)]))
#80% have an URL


In [None]:
#Same process with all numerical features transformation.
#['Num of Followers','Num of People Following','Num of Status Updates','Num of Direct Messages','Avg Daily Profile Visit Duration in seconds','Avg Daily Profile Clicks','Num of Profile Likes']
fig,ax = plt.subplots(2,3,figsize=(20,7))
sns.kdeplot(df_train['Num of Followers'], label="URL Defined", shade=True, color="red",ax=ax[0,0])
sns.kdeplot(df_train['Num of People Following'], label="URL Defined", shade=True, color="red",ax=ax[0,1])
sns.kdeplot(df_train['Num of Status Updates'], label="URL Defined", shade=True, color="red",ax=ax[0,2])
sns.kdeplot(df_train['Num of Direct Messages'], label="URL Defined", shade=True, color="red",ax=ax[1,0])
sns.kdeplot(df_train['Avg Daily Profile Visit Duration in seconds'], label="URL Defined", shade=True, color="red",ax=ax[1,1])
sns.kdeplot(df_train['Avg Daily Profile Clicks'], label="URL Defined", shade=True, color="red",ax=ax[1,2])
plt.show()


In conclusion, a log transformation must be applied to all numerical features except 'Avg Daily Profile Visit Duration in seconds'.

In [None]:
fig,ax = plt.subplots(2,3,figsize=(20,7))
sns.kdeplot(np.log(1+df_train['Num of Followers']), label="URL Defined", shade=True, color="red",ax=ax[0,0])
sns.kdeplot(np.log(1+df_train['Num of People Following']), label="URL Defined", shade=True, color="red",ax=ax[0,1])
sns.kdeplot(np.log(1+df_train['Num of Status Updates']), label="URL Defined", shade=True, color="red",ax=ax[0,2])
sns.kdeplot(np.log(1+df_train['Num of Direct Messages']), label="URL Defined", shade=True, color="red",ax=ax[1,0])
sns.kdeplot(df_train['Avg Daily Profile Visit Duration in seconds'], label="URL Defined", shade=True, color="red",ax=ax[1,1])
sns.kdeplot(np.log(1+df_train['Avg Daily Profile Clicks']), label="URL Defined", shade=True, color="red",ax=ax[1,2])
plt.show()

In [None]:
#feature extraction on image -> img_train[image#][line#][pixel#][color#(RBG)]
#Average color
temp_fea = []
for image in img_train:
  red = 0
  green = 0
  blue = 0
  for line in image:
    for pixel in line:
      red += pixel[0]
      green += pixel[1]
      blue += pixel[2]
  temp_fea.append([red/(32*32),green/(32*32),blue/(32*32)])

#red = red / (len(img_train)*len(img_train[0])*len(img_train[0][0]))
#green = green / (len(img_train)*len(img_train[0])*len(img_train[0][0]))
#blue = blue / (len(img_train)*len(img_train[0])*len(img_train[0][0]))
#print('Avg red color per pixel: ',red,'\nAvg red color per pixel: ',green,'\nAvg red color per pixel:',blue)

In [None]:
temp_fea = pd.DataFrame(temp_fea, columns=['Red','Green','Blue'])
temp_fea.head(5)

In [None]:
fig,ax = plt.subplots(1,3,figsize=(20,7))
sns.kdeplot(temp_fea["Red"], label="Red", shade=True, color="red",ax=ax[0])
sns.kdeplot(temp_fea["Green"], label="Green", shade=True, color="green",ax=ax[1])
sns.kdeplot(temp_fea["Blue"], label="Blue", shade=True, color="blue",ax=ax[2])
plt.show()

In [None]:
num_fea = ['Red','Green','Blue','Num of Profile Likes']
sns.pairplot(pd.concat([temp_fea,np.log(1+df_train['Num of Profile Likes'])],axis=1),x_vars=num_fea,y_vars=num_fea)

Not much correlation between colors and number of Likes.

In [None]:
#Number of black pixels exploration -> shape indicator
temp_fea = []

for image in img_train:
  black_one = 0
  black_zero = 0
  for line in image:
    for pixel in line:
      if list(pixel) == [1,1,1]:
        black_one += 1
      elif list(pixel) == [0,0,0]:
        black_zero += 1
  temp_fea.append([black_zero,black_one])
#black = sum(temp_fea) / (len(img_train))
#print('Avg black pixels per image: ',black)


In [None]:
temp_fea = pd.DataFrame(temp_fea,columns=['black_zero','black_one'])
bool_bg = temp_fea['black_zero'] > temp_fea['black_one']
print(bool_bg)

In [None]:
g = sns.kdeplot(np.log(1+df_train['Num of Profile Likes'][bool_bg]), label="0 background", shade=True, color="red")
g = sns.kdeplot(np.log(1+df_train['Num of Profile Likes'][~bool_bg]), label="1 background", shade=True, color="green")
plt.xlabel("Likes")
plt.ylabel("Density")
plt.legend()
plt.show()

Not much correlation with background color = [1,1,1] or [0,0,0].

In [None]:
num_fea = ['black_zero','black_one','Num of Profile Likes']
sns.pairplot(pd.concat([temp_fea,np.log(1+df_train['Num of Profile Likes'])],axis=1),x_vars=num_fea,y_vars=num_fea)

In [None]:
g = sns.relplot(x=temp_fea['black_one'], y=np.log(1+df_train['Num of Profile Likes']), kind="line")

In [None]:
# Number of images with fewer than 200 [1,1,1] pixels plus number of images with fewer
# than 200 [0,0,0] pixels. temp_fea was rebuilt with named columns above, so the columns
# have to be addressed by name (positional labels 0/1 no longer exist).
len(temp_fea.loc[temp_fea['black_one'] < 200])+len(temp_fea.loc[temp_fea['black_zero'] < 200])

In [None]:
#Image shape categorization

#1- convert to black and white
#2- correlate with number of black pixel +-5% OR use clustering with number of different shape (face types) -> too many different shapes
#3- Try to get male of female information


In [None]:
#Cross validation split
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.cluster import SpectralClustering
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering

# Create dataset to use : PCA(n=2) reduced digits
X = df_train_img.iloc[:,0:3072]

pca = PCA(n_components=5)
X_r = pca.fit(X).transform(X)

# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(
    X_r, df_train_img['Num of Profile Likes'], test_size=0.5, random_state=0)

In [None]:
#Grid Search
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from sklearn.svm import SVR

from sklearn.cluster import SpectralClustering

from sklearn.model_selection import cross_val_score

# Create dataset to use : PCA(n=2) reduced digits
X = df_train_img.iloc[:,0:3072]

pca = PCA(n_components=5)
X_r = pca.fit(X).transform(X)

# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(
    X_r, df_train_img['Num of Profile Likes'], test_size=0.5, random_state=0)

# Set the parameters by cross-validation
''' tuned_parameters = [
  {'kernel': ['rbf'],
  'gamma': [1e-3, 1e-4],
  'C': [1, 2, 10, 100, 1000]},
  
  {'kernel': ['linear'],
   'C': [1, 2, 10, 100, 1000]},
] '''
tuned_parameters = [
  {'n_clusters': np.arange(100,501,100)}
]

metric = 'neg_mean_squared_log_error'

cv_strategy = KFold(n_splits=4, shuffle=True)

grid_search = GridSearchCV(
    KMeans(), tuned_parameters, scoring=metric, cv=cv_strategy
)
grid_search.fit(X_train, y_train);
print('Finished!')

In [None]:
print("Best parameters set found on development set:")
print()
print(grid_search.best_params_)

In [None]:
print("Grid scores on development set:")
print()
means = grid_search.cv_results_['mean_test_score']
stds = grid_search.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))

In [None]:
#À tester : SpectralClustering
#AgglomerativeClustering ->simple,ward

In [None]:
from sklearn import datasets
from sklearn.decomposition import PCA
import numpy as np

# Create dataset to use : PCA(n=2) reduced digits
X = df_train_img.iloc[:,0:3072]

pca = PCA(n_components=5)
X_r = pca.fit(X).transform(X)

In [None]:
from sklearn.cluster import KMeans

from sklearn.model_selection import train_test_split

X_train, X_valid = train_test_split(X_r, test_size=0.33, random_state=1)

In [None]:
valid_scores = []

for k in np.arange(1,100):

  kmeans = KMeans(n_clusters=k)
  kmeans.fit(X_train)
  valid_scores.append(kmeans.score(X_valid))

In [None]:
# Plotting the validation scores 
import matplotlib.pyplot as plt

plt.figure(figsize=(10,8))

plt.plot(valid_scores, c='g')
plt.xticks(np.arange(1,100,5))
plt.xlabel('k')
plt.ylabel('score')
plt.title('Validation score for different values of k with K-Means for PCA(n=5) images DF')
plt.plot();

In [None]:
valid_scores = []

for k in np.arange(1,100):

  kmeans = KMeans(n_clusters=k)
  kmeans.fit(X_train)
  valid_scores.append(kmeans.score(X_valid))

In [None]:
prep_img_df.drop(prep_img_df.iloc[:,0:3072],axis=1).head(5)

In [None]:
#ML for image feature extraction
import tensorflow as tf
from sklearn.metrics import silhouette_score
import cv2

images  = np.array(np.float32(img_train).reshape(len(img_train), -1)/255)

model = tf.keras.applications.MobileNetV2(include_top=False,weights='imagenet', input_shape=(32, 32, 3))
predictions = model.predict(images.reshape(-1, 32, 32, 3))
pred_images = predictions.reshape(images.shape[0], -1)

# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(
    pred_images, df_train_img['Num of Profile Likes'], test_size=0.5, random_state=0)

In [None]:
len(predictions)

For KMeans score : increase becomes linear at K = 20

In [None]:
''' X_train, X_test, y_train, y_test = train_test_split(prep_df[cols_x], prep_df[cols_y], 
                                                    test_size=0.3, 
                                                    shuffle=True, 
                                                    random_state=76 #  To guarantee that the split will always be the same
                                                    )   '''

result = pd.DataFrame(columns = ("Classifiers","Training","Testing"))
train_scores, test_scores = [],[]

In [None]:
#Spectral Clustering

clf_ext = ExtraTreesRegressor(
    max_features='auto',
    bootstrap=True,
    oob_score=True,
    #n_estimators=1000,
    #max_depth=None,
    max_depth = 10,
    min_samples_leaf = 10,
    n_estimators = 2000,
    #min_samples_split=10
    min_samples_split = 20
    #class_weight="balanced",
    #min_weight_fraction_leaf=0.02
    )

train_scores, test_scores = [],[]

for k in np.arange(1,30,5):
  kspec = SpectralClustering(n_clusters=k,random_state=1,n_neighbors=5,affinity='nearest_neighbors')
  clusters = pd.DataFrame(kspec.fit_predict(X_train,y_train))
  #temp_df = pd.concat([y_train,clusters],axis=1)
  
  #clf_ext = clf_ext.fit(np.array(temp_df[0]).reshape(-1, 1),temp_df['Num of Profile Likes'])
  clf_ext = clf_ext.fit(clusters,y_train)

  train_scores.append(np.sqrt(mean_squared_log_error(y_train, clf_ext.predict(clusters).reshape(-1, 1))))
  test_scores.append(np.sqrt(mean_squared_log_error(y_test, clf_ext.predict(kspec.fit_predict(X_test).reshape(-1, 1)))))


In [None]:
print('Train RSMLE: ',train_scores)
print('Test RSMLE: ',test_scores)

In [None]:
#K-Means Clustering
# For several cluster counts k: cluster the training images, fit an extra-trees
# regressor that predicts likes from the cluster id alone, and record the RMSLE on
# the train and test splits.

clf_ext = ExtraTreesRegressor(
    # 1.0 = use all features; this is what 'auto' meant for regressors, and
    # 'auto' is no longer accepted by recent scikit-learn releases.
    max_features=1.0,
    bootstrap=True,
    oob_score=True,
    max_depth = 10,
    min_samples_leaf = 10,
    n_estimators = 2000,
    min_samples_split = 20
    )

train_scores, test_scores = [],[]

for k in np.arange(1,30,5):
  kmeans = KMeans(n_clusters=k,random_state=1)
  clusters = pd.DataFrame(kmeans.fit_predict(X_train))

  clf_ext = clf_ext.fit(clusters,y_train)

  # Assign test samples to the centroids learned on the training split. Re-fitting
  # K-Means on the test split would produce cluster ids unrelated to the ones the
  # regressor was trained on.
  test_clusters = pd.DataFrame(kmeans.predict(X_test))

  train_scores.append(np.sqrt(mean_squared_log_error(y_train, clf_ext.predict(clusters))))
  test_scores.append(np.sqrt(mean_squared_log_error(y_test, clf_ext.predict(test_clusters))))

In [None]:
print('Train RSMLE: ',train_scores)
print('Test RSMLE: ',test_scores)

In [None]:
#Final model testing

from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AgglomerativeClustering

kmeans = KMeans(n_clusters=20)
kspec = SpectralClustering(n_clusters=20)
kagg = AgglomerativeClustering(n_clusters=20)

kmeans.fit(X)

In [None]:
#correlation with nbr of likes
df_train_img = pd.concat([df_train['Num of Profile Likes'],pd.DataFrame(np.array(img_train).reshape([7500,-1]))],axis=1)


In [None]:
#image pixel bits inversion
inv_df_img = pd.DataFrame(np.array(img_train).reshape([7500,-1])).applymap(lambda x:sum(1<<(8-1-i) for i in range(8) if x>>i&1))
inv_df_img = pd.concat([df_train['Num of Profile Likes'],inv_df_img],axis=1)

In [None]:
corr_train = df_train_img.corr().abs().sort_values(by=["Num of Profile Likes"],ascending=False)
corr_train.head(10)

In [None]:
top = 100
bool_ar = np.array([255 if (i in corr_train.head(top).index) else 0 for i in range(0,len(df_train_img.columns)-1)])
bool_ar = bool_ar.reshape(32,32,3)
print(corr_train.head(top)["Num of Profile Likes"].sum()/corr_train["Num of Profile Likes"].sum())

In [None]:
plt.imshow(X=bool_ar,cmap='gray')
plt.show()

In [None]:
print(pixel_index_selection)

In [None]:
len(df_train_img.columns)

In [None]:
df_img_test = pd.DataFrame()
pixel_index_selection = [2043, 1947, 2046, 2142, 1893, 2139, 1986, 2238, 1944, 2079, 1950, 2337, 1848, 1752, 1509, 1797, 1983, 2181, 2040, 2082, 1413, 1989, 
                            1851, 2274, 1656, 1566, 2241, 1605, 1896, 1992, 1701, 2178, 1599, 2268, 2235, 1695, 2271, 1560, 2175, 1890, 2364, 2085, 1293, 2367, 
                            1662, 1290, 1287, 1464, 1857, 1414, 1296, 1488, 2370, 525, 1506, 1368, 2340, 1284, 1761, 1299, 1884, 1485, 1758, 1677, 1791, 1117, 
                            1680, 1302, 2334, 2145, 1305, 1779, 1389, 1212, 1308, 1584, 1773, 1776, 1114, 1392, 1788, 1899, 2160, 1770, 1860, 2157, 1881, 1206, 
                            1581, 1395, 1872, 1386, 1869, 1767, 1749, 2361, 2344, 1473, 1203]
  
selected_color_pixels = np.array([True if (i in pixel_index_selection) else False for i in range(0,len(df_train_img.columns)-1)])
  


In [None]:
col_names = ['pixel_' + str(i) for i in sorted(pixel_index_selection)]
print(col_names)

In [None]:
df_img_test = pd.DataFrame(np.array(img_train).reshape([len(img_train),-1])).loc[:,selected_color_pixels]
df_img_test.columns = col_names

In [None]:
df_img_test['Sum Selected Pixels'] = df_img_test[col_names].sum(axis=1)

In [None]:
np.sum(df_img_test.loc[0,:])

In [None]:
df_img_test

In [None]:
corr_train.head(top).index

In [None]:
corr_sum = []
tot_corr = corr_train["Num of Profile Likes"].sum()
for top in range(1,1000):
  bool_ar = np.array([255 if (i in corr_train.head(top).index) else 0 for i in range(0,len(df_train_img.columns)-1)])
  bool_ar = bool_ar.reshape(32,32,3)
  corr_sum.append(corr_train.head(top)["Num of Profile Likes"].sum()/tot_corr)


In [None]:
plt.plot(range(1,1000),corr_sum)
plt.show()

In [None]:
plt.plot(range(1,3073),corr_train["Num of Profile Likes"][1:3073])

In [None]:
corr_train["Num of Profile Likes"][1:3070]

In [None]:
df_train_img.columns

In [None]:
df_clust_train = pd.DataFrame()
df_clust_test = pd.DataFrame()

#Neural Network for image feature extraction - didn't show better results. Maybe to explore further.
''' images  = np.array(np.float32(img_train).reshape(len(img_train), -1)/255)
model = tf.keras.applications.MobileNetV2(include_top=False,weights='imagenet', input_shape=(32, 32, 3))
predictions = model.predict(images.reshape(-1, 32, 32, 3))
pred_images = predictions.reshape(images.shape[0], -1) '''

df_train_img = pd.DataFrame(np.array(img_train).reshape([len(img_train),-1]))
df_test_img = pd.DataFrame(np.array(img_test).reshape([len(img_test),-1]))

#PCA can also be applied prior to clustering
pca = PCA(n_components=5)
df_train_img_pca = pca.fit(df_train_img).transform(df_train_img)
df_test_img_pca = pca.transform(df_test_img)

#Spectral Clustering
kspec = SpectralClustering(n_clusters=20,random_state=1,n_neighbors=5,affinity='nearest_neighbors')
df_clust_train['Spectral Clusters'] = kspec.fit_predict(df_train_img_pca)
df_clust_test['Spectral Clusters'] = kspec.fit_predict(df_test_img_pca)

#K-Means Clustering
kmeans = KMeans(n_clusters=20,random_state=1)
df_clust_train['KMeans Clusters'] = pd.DataFrame(kmeans.fit_predict(df_train_img_pca))
df_clust_test['KMeans Clusters'] = pd.DataFrame(kmeans.fit_predict(df_test_img_pca))

In [None]:
df_clust_train

## Preprocessing Functions


### Tidying Function

In [None]:
def tidyer(dataset_train, dataset_test):
  """Turn the raw train and test profile tables into tidy feature frames.

  Both inputs are processed with the same rules; the training frame additionally
  gets a leading 'Likes' column holding log(1 + 'Num of Profile Likes').

  Args:
    dataset_train: raw training DataFrame (must contain 'Num of Profile Likes').
    dataset_test: raw test DataFrame with the same feature columns.

  Returns:
    (tidy_train, tidy_test): two new DataFrames; the inputs are not modified.

  Notes:
    * Missing numeric values are filled with the median of the frame they
      belong to (train median for train, test median for test).
    * 'User Language' is passed to ``langcodes`` as-is, so a missing language
      code is not handled here.
  """
  utc_labels = ['America','Europe & Africa','Middle East','East Asia']
  # Bin edges applied to 'UTC Offset' (presumably seconds -- TODO confirm).
  utc_bins = [-5400,9000,23400]
  color_features = ['Profile Text Color','Profile Page Color','Profile Theme Color']
  log_count_features = ['Num of Followers','Num of People Following',
                        'Num of Status Updates','Num of Direct Messages']

  # Time zone -> UTC offset lookup built from train and test together, so a zone whose
  # offset is missing on some rows can still be resolved. When a zone appears with
  # several offsets the smallest one is used.
  merge_df_time = pd.concat([dataset_train[['UTC Offset','User Time Zone']],
                             dataset_test[['UTC Offset','User Time Zone']]], axis=0)
  offset_by_zone = merge_df_time.groupby('User Time Zone')['UTC Offset'].min().dropna().to_dict()

  def time_region(zone):
    # Missing zone, or zone for which no row carries an offset -> 'Unknown'.
    if pd.isnull(zone) or zone not in offset_by_zone:
      return 'Unknown'
    return utc_labels[np.digitize(offset_by_zone[zone], utc_bins)]

  def has_custom_color(value):
    # False for missing values, malformed (non 6-character) codes and plain '000000'.
    return not (pd.isnull(value) or len(str(value)) != 6 or value == '000000')

  def median_filled(column):
    return column.fillna(value=column.median())

  def tidy_one(dataset, likes=None):
    tidy_dataset = pd.DataFrame()

    # Num of profile likes only exists in the training set
    if likes is not None:
      tidy_dataset['Likes'] = np.log(1+likes)

    tidy_dataset['Name length'] = dataset['User Name'].apply(len)
    tidy_dataset['URL bool'] = dataset['Personal URL'].str.len() > 12   # shorter URLs are considered irrelevant

    # Anything that is neither 'Set' nor 'Not set' (including missing) becomes 'Unknown'.
    # Series.where avoids the chained-indexing assignment, which pandas warns about
    # and which does not write through under copy-on-write.
    cover_status = dataset['Profile Cover Image Status']
    tidy_dataset['Cover Image Status'] = cover_status.where(cover_status.isin(['Set','Not set']), 'Unknown')

    tidy_dataset['Profile Verification Status'] = dataset['Profile Verification Status'].copy()

    for color_feature in color_features:
      tidy_dataset[color_feature + ' bool'] = dataset[color_feature].apply(has_custom_color)

    tidy_dataset['View Customized bool'] = dataset['Is Profile View Size Customized?'].copy()

    tidy_dataset['Time Region'] = dataset['User Time Zone'].apply(time_region)

    tidy_dataset['Location Public Visibility'] = dataset['Location Public Visibility'].apply(
        lambda x:'unknown' if pd.isnull(x) else x.lower())
    tidy_dataset['User Language'] =  dataset['User Language'].apply(lambda x:langcodes.Language.get(x).describe()['language'])
    # The year is the last four characters of the timestamp string.
    tidy_dataset['Profile Creation Year'] = dataset['Profile Creation Timestamp'].apply(lambda x:int(x[-4:]))

    # Heavy-tailed counts are log-scaled; the visit duration is kept on its original scale.
    for count_feature in log_count_features:
      tidy_dataset[count_feature] = np.log(1+median_filled(dataset[count_feature]))
    tidy_dataset['Avg Daily Profile Visit Duration in seconds'] = median_filled(dataset['Avg Daily Profile Visit Duration in seconds'])
    tidy_dataset['Avg Daily Profile Clicks'] = np.log(1+median_filled(dataset['Avg Daily Profile Clicks']))

    # Missing or single-blank categories are grouped under 'unknown'.
    tidy_dataset['Profile Category'] = dataset['Profile Category'].apply(
        lambda x:'unknown' if (pd.isnull(x) or x==' ') else x.lower())

    return tidy_dataset

  return tidy_one(dataset_train, dataset_train['Num of Profile Likes']), tidy_one(dataset_test)

### Image Feature Extraction

In [None]:
def img_extr(img_array,pixel_index_selection):
  """Extract hand-crafted features from a collection of equally sized 3-channel images.

  Args:
    img_array: sequence of images, each an array of shape (height, width, 3).
    pixel_index_selection: indices into the flattened image (row-major over
      height, width, channel) of the colour values to keep as individual features.
      Order and duplicates are irrelevant.

  Returns:
    DataFrame with one row per image and, in this order:
      * 'pixel_<i>' for every selected index i (ascending),
      * 'Sum Selected Pixels': sum of those selected values,
      * 'Img Black Zeros bool': more [0,0,0] pixels than [1,1,1] pixels,
      * 'Img Num Black Pixels': number of [0,0,0] plus [1,1,1] pixels,
      * 'Img Average Red/Green/Blue level': mean of channel 0/1/2.

  NOTE(review): the colour names follow the channel position (0, 1, 2). If the
  images come from cv2.imread, which decodes to B, G, R, channel 0 is actually
  blue -- the names are kept unchanged so downstream column names stay stable.
  """
  images = np.asarray(img_array)
  n_images = len(images)

  #Pixel selection based on correlation analysis
  flat_images = images.reshape(n_images,-1)
  selected_pixels = sorted(set(pixel_index_selection))
  df_img = pd.DataFrame(flat_images[:,selected_pixels],
                        columns=['pixel_' + str(i) for i in selected_pixels])

  #Sum of selected pixel values
  df_img['Sum Selected Pixels'] = df_img.sum(axis=1)

  #Number of black pixels & average colors
  # Vectorized over all images at once. Besides being much faster than a Python loop
  # over every pixel, the reductions below accumulate in wide dtypes, whereas adding
  # uint8 scalars one by one can wrap around at 255.
  pixels = images.reshape(n_images,-1,3)
  black_zero = (pixels == 0).all(axis=2).sum(axis=1)
  black_one = (pixels == 1).all(axis=2).sum(axis=1)
  channel_means = pixels.mean(axis=1,dtype=np.float64)

  df_img['Img Black Zeros bool'] = black_zero > black_one
  df_img['Img Num Black Pixels'] = black_zero + black_one
  df_img['Img Average Red level'] = channel_means[:,0]
  df_img['Img Average Green level'] = channel_means[:,1]
  df_img['Img Average Blue level'] = channel_means[:,2]

  return df_img

In [None]:
def img_clust(img_array_train,img_array_test):
  """Derive categorical cluster-membership features from the raw profile images.

  The flattened images are reduced to 5 principal components (PCA fitted on the
  training images only) and then clustered into 20 groups with two algorithms.

  Args:
    img_array_train: sequence of equally shaped training images.
    img_array_test: sequence of test images with the same shape.

  Returns:
    (df_clust_train, df_clust_test): DataFrames with the string-typed columns
    'Img Spectral Clusters' and 'Img KMeans Clusters', one row per image.
  """
  df_clust_train,df_clust_test = pd.DataFrame(),pd.DataFrame()
  n_train = len(img_array_train)

  #Neural Network (MobileNetV2) image embeddings were tried instead of raw pixels
  #and didn't show better results. Maybe to explore further.

  train_pixels = np.array(img_array_train).reshape([n_train,-1])
  test_pixels = np.array(img_array_test).reshape([len(img_array_test),-1])

  #PCA is applied prior to clustering
  pca = PCA(n_components=5,random_state=42)
  train_pca = pca.fit(train_pixels).transform(train_pixels)
  test_pca = pca.transform(test_pixels)

  #Spectral Clustering
  # SpectralClustering cannot assign new samples to existing clusters (it has no
  # predict method). Clustering train and test separately would yield two unrelated
  # label sets (cluster "3" would mean different things in each frame), so the model
  # is fitted once on the stacked data and the labels are split afterwards.
  kspec = SpectralClustering(n_clusters=20,random_state=42,n_neighbors=5,affinity='nearest_neighbors')
  spectral_labels = kspec.fit_predict(np.vstack([train_pca,test_pca])).astype(str)
  df_clust_train['Img Spectral Clusters'] = spectral_labels[:n_train]
  df_clust_test['Img Spectral Clusters'] = spectral_labels[n_train:]

  #K-Means Clustering
  # Centroids are learned on the training images; test images are assigned to them.
  # Labels are cast with the builtin str (the np.str alias was removed from NumPy).
  kmeans = KMeans(n_clusters=20,random_state=42)
  df_clust_train['Img KMeans Clusters'] = kmeans.fit_predict(train_pca).astype(str)
  df_clust_test['Img KMeans Clusters'] = kmeans.predict(test_pca).astype(str)

  return df_clust_train,df_clust_test

### Feature Standardization & Encoding Function

In [None]:
def standard_encoder(dataset,test_data, num_fea,cat_fea):
  """One-hot encode the categorical features and standardise the numerical ones.

  Every encoder / scaler is fitted on ``dataset`` (train) only and then applied
  to ``test_data``, so both outputs share the same columns for the encoded features.

  Parameters
  ----------
  dataset, test_data : pd.DataFrame
      Train and test frames; neither is modified (copies are transformed).
  num_fea : list of str
      Columns replaced by their standardised values (StandardScaler).
  cat_fea : list of str
      Columns replaced by one indicator column per class seen in ``dataset``,
      named ``'<feature> <class>'``.

  Returns
  -------
  (standard_dataset, test_dataset) : tuple of pd.DataFrame
  """
  standard_dataset = dataset.copy()
  test_dataset = test_data.copy()

  def _one_hot(binarizer, frame, feature):
    """Return one indicator column per class known to ``binarizer``, indexed like ``frame``."""
    encoded = binarizer.transform(frame[feature])
    if encoded.shape[1] != len(binarizer.classes_):
      # LabelBinarizer collapses 1- and 2-class features into a single column,
      # which would not match the per-class column names; rebuild the indicators.
      values = frame[feature].to_numpy()
      encoded = np.column_stack([(values == label).astype(int) for label in binarizer.classes_])
    # str(label) keeps the naming working for non-string categories
    columns = [feature + ' ' + str(label) for label in binarizer.classes_]
    # Reuse the source index so that the axis=1 concat below aligns row by row
    return pd.DataFrame(encoded, columns=columns, index=frame.index)

  for feature in cat_fea:
    lb = LabelBinarizer()
    lb.fit(standard_dataset[feature])
    standard_dataset = pd.concat([standard_dataset, _one_hot(lb, standard_dataset, feature)],axis=1).drop([feature], axis=1)
    test_dataset = pd.concat([test_dataset, _one_hot(lb, test_dataset, feature)],axis=1).drop([feature], axis=1)

  for feature in num_fea:
    sc = StandardScaler()
    # The scaler expects a 2-D column; ravel() turns the result back into a 1-D column
    standard_dataset[feature] = sc.fit_transform(np.array(standard_dataset[feature]).reshape(-1,1)).ravel()
    test_dataset[feature] = sc.transform(np.array(test_dataset[feature]).reshape(-1,1)).ravel()

  return standard_dataset, test_dataset

### Feature Selection Function

In [None]:
def feature_select(dataset_train):
  """Rank the features by their absolute correlation with the 'Likes' target.

  Parameters
  ----------
  dataset_train : pd.DataFrame
      Training frame; must contain a numeric 'Likes' column.

  Returns
  -------
  pd.DataFrame
      Absolute correlation matrix of the numeric / boolean columns, with rows
      sorted by decreasing |correlation| with 'Likes'. Callers in this notebook
      skip the first row (``index[1:]``), presumably 'Likes' itself.
  """
  # Non-numeric columns make DataFrame.corr() raise in recent pandas (older
  # releases dropped them silently), so restrict the frame explicitly.
  numeric_part = dataset_train.select_dtypes(include=['number', 'bool'])
  corr_prep = numeric_part.corr().abs().sort_values(by=["Likes"],ascending=False)

  return corr_prep

## Further Data Exploration and Model Exploration

### Tidy set Visualization

In [None]:
# Tidy the raw tabular data (train and test are processed together by tidyer)
tidy_df_train,tidy_df_test = tidyer(df_train,df_test)

# Indices of the flattened-image values kept as individual pixel features
pixel_index_selection = [2043, 1947, 2046, 2142, 1893, 2139, 1986, 2238, 1944, 2079, 1950, 2337, 1848, 1752, 1509, 1797, 1983, 2181, 2040, 2082, 1413, 1989,
                         1851, 2274, 1656, 1566, 2241, 1605, 1896, 1992, 1701, 2178, 1599, 2268, 2235, 1695, 2271, 1560, 2175, 1890, 2364, 2085, 1293, 2367,
                         1662, 1290, 1287, 1464, 1857, 1414, 1296, 1488, 2370, 525, 1506, 1368, 2340, 1284, 1761, 1299, 1884, 1485, 1758, 1677, 1791, 1117,
                         1680, 1302, 2334, 2145, 1305, 1779, 1389, 1212, 1308, 1584, 1773, 1776, 1114, 1392, 1788, 1899, 2160, 1770, 1860, 2157, 1881, 1206,
                         1581, 1395, 1872, 1386, 1869, 1767, 1749, 2361, 2344, 1473, 1203]

# Append the per-image features, then the image-cluster features, column-wise
img_fea_train = img_extr(img_train,pixel_index_selection)
tidy_df_train = pd.concat([tidy_df_train,img_fea_train],axis=1)
img_fea_test = img_extr(img_test,pixel_index_selection)
tidy_df_test = pd.concat([tidy_df_test,img_fea_test],axis=1)

img_clust_train,img_clust_test = img_clust(img_train,img_test)
tidy_df_train = pd.concat([tidy_df_train,img_clust_train],axis=1)
tidy_df_test = pd.concat([tidy_df_test,img_clust_test],axis=1)

# Columns to standardise: profile counters, image statistics and the selected pixels
num_fea = [
    'Name length',
    'Profile Creation Year',
    'Num of Followers',
    'Num of People Following',
    'Num of Status Updates',
    'Num of Direct Messages',
    'Avg Daily Profile Visit Duration in seconds',
    'Avg Daily Profile Clicks',
    'Img Num Black Pixels',
    'Img Average Red level',
    'Img Average Green level',
    'Img Average Blue level',
    'Sum Selected Pixels',
]
num_fea += [f'pixel_{pixel_idx}' for pixel_idx in sorted(pixel_index_selection)]

# Columns to one-hot encode
cat_fea = [
    'Cover Image Status',
    'Profile Verification Status',
    'Time Region',
    'Location Public Visibility',
    'User Language',
    'Profile Category',
    'Img Spectral Clusters',
    'Img KMeans Clusters',
]
prep_df_train,prep_df_test = standard_encoder(tidy_df_train,tidy_df_test,num_fea,cat_fea)
#prep_df.info()

In [None]:
# Preview the first five rows of the tidy (not yet encoded) training frame.
tidy_df_train.head(5)

In [None]:
# Preview the first five rows of the training frame returned by standard_encoder.
prep_df_train.head(5)

In [None]:
# Preview the first five rows of the test frame returned by standard_encoder.
prep_df_test.head(5)

In [None]:
# Distribution of the 'Likes' target for each value of 'URL bool'.
sns.boxplot(x='URL bool',y='Likes',data=tidy_df_train)

In [None]:
# Pairwise scatter plots of the count/activity features and the target, coloured by 'URL bool'.
# NOTE(review): this rebinds `num_fea`, the list previously passed to standard_encoder;
# re-run the preprocessing cell before calling standard_encoder again.
num_fea = ['Num of Followers','Num of People Following','Num of Status Updates','Num of Direct Messages','Avg Daily Profile Visit Duration in seconds','Avg Daily Profile Clicks','Likes']
sns.pairplot(tidy_df_train,x_vars=num_fea,y_vars=num_fea,hue='URL bool')

In [None]:
# Pairwise scatter plots of the name-length, colour and creation-year features and the target,
# coloured by 'Profile Verification Status'.
# NOTE(review): this rebinds `num_fea` again; it no longer matches the list used for standard_encoder.
num_fea = ['Name length','Text Red','Page Red','Theme Red','Text Green','Page Green','Theme Green','Text Blue','Page Blue','Theme Blue','Profile Creation Year','Likes']
sns.pairplot(tidy_df_train,x_vars=num_fea,y_vars=num_fea,hue='Profile Verification Status')

Most of the numerical features do not show a good correlation with the number of Likes. The Number of People Following and the Average Profile Clicks are the only ones that show a slight correlation.

In [None]:
# Display the encoded training frame.
prep_df_train

### Feature Selection Exploration

In [None]:
# Absolute correlation of every feature with 'Likes', strongest first
# (same computation as the feature_select helper defined above).
corr_prep = feature_select(prep_df_train)
# Iterate over (label, value) pairs: `corr_prep['Likes'][i]` used an integer key on a
# label-indexed Series, a positional fallback that pandas has deprecated.
for i, (feature_name, corr_value) in enumerate(corr_prep['Likes'].items()):
  print(i,': ', feature_name,' -> ', corr_value)

In [None]:
# List every column of the encoded training frame, one name per line.
for i in prep_df_train.columns:
  print(i)

In [None]:
# First (largest) value of the sorted |correlation with 'Likes'| column.
# .iloc makes the positional access explicit; an integer key on a label index is deprecated in pandas.
corr_prep['Likes'].iloc[0]

In [None]:
# Score the candidate predictors (columns 1..202) against column 0 with mutual information.
# NOTE(review): the positional slices assume the target sits in column 0 - confirm against tidyer.
candidate_features = prep_df_train.iloc[:,1:203]
fea_sel_mod = SelectKBest(score_func=mutual_info_regression, k=50)
fea_sel_mod.fit(candidate_features,prep_df_train.iloc[:,0])

# One row per feature, best score first, re-indexed 0..n-1 so the index is the rank
fea_sel_prep = (
    pd.DataFrame({'Feature names':candidate_features.columns,'Feature selection score':fea_sel_mod.scores_})
    .sort_values(by=["Feature selection score"],ascending=False)
    .reset_index(drop=True)
)
for i in fea_sel_prep.index:
  print(i,': ', fea_sel_prep.loc[i,'Feature names'],' -> ', fea_sel_prep.loc[i,'Feature selection score'])

### Model Exploration

In [None]:
from sklearn.metrics import mean_squared_log_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import KFold

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, 
    VotingRegressor, StackingRegressor, BaggingRegressor,
)
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import make_scorer
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR, LinearSVR
from xgboost import XGBRegressor
import seaborn as sns
from  matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn import linear_model
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from scipy.special import boxcox1p, inv_boxcox1p
from scipy.stats import boxcox_normmax
from scipy.stats import skew, norm
from scipy.stats import yeojohnson
from sklearn.preprocessing import PowerTransformer
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import RidgeCV

# Correlation-based ranking: drop the first entry of the sorted index
# (presumably the target itself) and keep the rest as candidate features.
all_features = corr_prep.index[1:]

# Alternative ranking (mutual information):
#all_features = fea_sel_prep['Feature names']

# Hand-picked alternative to the top-35 selection below:
#cols_x = ['URL bool','Num of People Following','Num of Status Updates','Location Public Visibility disabled','Location Public Visibility enabled','Profile Category unknown','Avg Daily Profile Clicks','Profile Verification Status Verified','Profile Verification Status Not verified','Sum Selected Pixels']
cols_y = 'Likes'
cols_x = all_features[0:35]

# 70/30 hold-out split; the fixed random_state keeps the split identical across runs
X_train, X_test, y_train, y_test = train_test_split(
    prep_df_train[cols_x],
    prep_df_train[cols_y],
    test_size=0.3,
    shuffle=True,
    random_state=76,
)

In [None]:
# Accumulators for the model comparison: each model cell below appends its name to
# `classifier` and its train / test RMSLE to `train_scores` / `test_scores`.
# `result` is only referenced by the commented-out summary code further down.
result = pd.DataFrame(columns = ("Classifiers","Training","Testing"))
classifier, train_scores, test_scores = [],[],[]

In [None]:
# Baseline 1: depth-limited regression tree
clf_tree = DecisionTreeRegressor(max_depth=10).fit(X_train,y_train)
classifier.append("DecisionTreeRegressor")

# RMSLE computed after mapping targets and predictions back through exp
tree_rmsle_train = np.sqrt(mean_squared_log_error(np.exp(y_train), np.exp(clf_tree.predict(X_train))))
train_scores.append(tree_rmsle_train)
tree_rmsle_test = np.sqrt(mean_squared_log_error(np.exp(y_test), np.exp(clf_tree.predict(X_test))))
test_scores.append(tree_rmsle_test)

In [None]:
# Baseline 2: ordinary least-squares regression
clf_lin = LinearRegression().fit(X_train,y_train)
classifier.append("LinearRegression")

# RMSLE computed after mapping targets and predictions back through exp
lin_rmsle_train = np.sqrt(mean_squared_log_error(np.exp(y_train), np.exp(clf_lin.predict(X_train))))
train_scores.append(lin_rmsle_train)
lin_rmsle_test = np.sqrt(mean_squared_log_error(np.exp(y_test), np.exp(clf_lin.predict(X_test))))
test_scores.append(lin_rmsle_test)

In [None]:
# Baseline 3: 3-nearest-neighbours regression
knn = KNeighborsRegressor(n_neighbors=3).fit(X_train, y_train)
classifier.append("KNeighborsRegressor")

# RMSLE computed after mapping targets and predictions back through exp
knn_rmsle_train = np.sqrt(mean_squared_log_error(np.exp(y_train), np.exp(knn.predict(X_train))))
train_scores.append(knn_rmsle_train)
knn_rmsle_test = np.sqrt(mean_squared_log_error(np.exp(y_test), np.exp(knn.predict(X_test))))
test_scores.append(knn_rmsle_test)

In [None]:
# Baseline 4: random forest with 100 trees (no fixed seed, so scores vary between runs)
random_forest = RandomForestRegressor(n_estimators=100).fit(X_train, y_train)
classifier.append("RandomForestRegressor")

# RMSLE computed after mapping targets and predictions back through exp
rf_rmsle_train = np.sqrt(mean_squared_log_error(np.exp(y_train), np.exp(random_forest.predict(X_train))))
train_scores.append(rf_rmsle_train)
rf_rmsle_test = np.sqrt(mean_squared_log_error(np.exp(y_test), np.exp(random_forest.predict(X_test))))
test_scores.append(rf_rmsle_test)

In [None]:
# Baseline 5: extremely-randomised trees, regularised through depth / leaf-size limits.
# bootstrap=True is required for the out-of-bag score.
clf_ext = ExtraTreesRegressor(
    # 'auto' meant "consider every feature" for regressors and has been removed from
    # scikit-learn; None expresses the same setting and is accepted by every release.
    max_features=None,
    bootstrap=True,
    oob_score=True,
    max_depth = 10,          # previously tried: None
    min_samples_leaf = 10,
    n_estimators = 2000,     # previously tried: 1000
    min_samples_split = 20   # previously tried: 10
    )
clf_ext = clf_ext.fit(X_train,y_train)
classifier.append("ExtraTreesRegressor")
# RMSLE computed after mapping targets and predictions back through exp
train_scores.append(np.sqrt(mean_squared_log_error(np.exp(y_train), np.exp(clf_ext.predict(X_train)))))
test_scores.append(np.sqrt(mean_squared_log_error(np.exp(y_test), np.exp(clf_ext.predict(X_test)))))

In [None]:
# Baseline 6: AdaBoost with 400 boosting rounds and a small learning rate
clf_ada = AdaBoostRegressor(n_estimators=400, learning_rate=0.1)
clf_ada.fit(X_train,y_train)
classifier.append("AdaBoostRegressor")

# RMSLE computed after mapping targets and predictions back through exp
ada_rmsle_train = np.sqrt(mean_squared_log_error(np.exp(y_train), np.exp(clf_ada.predict(X_train))))
train_scores.append(ada_rmsle_train)
ada_rmsle_test = np.sqrt(mean_squared_log_error(np.exp(y_test), np.exp(clf_ada.predict(X_test))))
test_scores.append(ada_rmsle_test)

In [None]:
#Stacking
# Base learners of the stacked ensemble. m_ab and m_catboost are built but
# currently left out of the stack (see the commented entries below).
m_rf = RandomForestRegressor(n_estimators=200,
                              max_depth=2,
                              min_samples_split=8,
                              min_samples_leaf=8,
                              max_features=None,
                              oob_score=True,
                              random_state=42)

m_svr = SVR(C=20, epsilon=0.004, gamma=0.0001)

# The base tree is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` across scikit-learn releases.
m_ab = AdaBoostRegressor(DecisionTreeRegressor(max_depth=5),
                        n_estimators=200,
                        loss='exponential',
                        learning_rate=0.01,random_state=42)
m_bag = BaggingRegressor(n_estimators=15,max_samples=10,max_features=5,random_state=42)

# NOTE(review): both `seed` and `random_state` are given; confirm which one the installed xgboost honours.
xgboost = XGBRegressor(learning_rate=0.01,
                      n_estimators=1000,
                      max_depth=5,
                      gamma=0.3,
                      subsample=0.9,
                      colsample_bytree=0.8,
                      objective='reg:squarederror',
                      nthread=-1,
                      scale_pos_weight=1,
                      seed=27,
                      reg_alpha=0.00009,
                      random_state=42)
# `loss` is left at its default (squared error): the former spelling 'ls' has been
# removed from scikit-learn, while the default means the same thing in every release.
m_gbr = GradientBoostingRegressor(n_estimators=500,
                              learning_rate=0.03,
                              max_depth=4,
                              max_features='sqrt',
                              min_samples_leaf=30,
                              min_samples_split=20,
                              random_state=42)
m_catboost = CatBoostRegressor(n_estimators=1000,
                              learning_rate=0.01,
                              rsm=0.4,
                              random_state=42)
model = StackingRegressor(
      estimators=[
          ('m_rf', m_rf),
          # ('m_ridge', m_ridge),
          ('m_svr', m_svr),
          ('m_bag', m_bag),
          # ('m_ab', m_ab),
          ('m_gbr', m_gbr),
          ('xgboost', xgboost),
          # ('m_catboost', m_catboost),
      ],
      n_jobs=-1,
      # verbosity must be an integer; the former 0.5 is rejected by scikit-learn's parameter validation
      verbose=1)

model.fit(X_train,y_train)
classifier.append("Stack")
# RMSLE computed after mapping targets and predictions back through exp
train_scores.append(np.sqrt(mean_squared_log_error(np.exp(y_train), np.exp(model.predict(X_train)))))
test_scores.append(np.sqrt(mean_squared_log_error(np.exp(y_test), np.exp(model.predict(X_test)))))

In [None]:
# Print the train and test RMSLE lists; entries follow the order of `classifier`.
# (The commented lines below would have filled the `result` summary frame instead.)
#,for i,text in enumerate(classifier):
    #result.loc[i+1] = [text,np.around(train_scores[i], decimals = 2),np.around(test_scores[i], decimals=2)]
print(train_scores,'\n\n',test_scores)


In [None]:
# Held-out targets after the same exp back-transform used in the scoring cells.
np.exp(y_test)

In [None]:
# Decision-tree predictions on the held-out set, after the same exp back-transform.
np.exp(clf_tree.predict(X_test))

In [None]:
# Same stacked ensemble as above, evaluated with 5-fold cross-validated predictions.
m_rf = RandomForestRegressor(n_estimators=200,
                              max_depth=2,
                              min_samples_split=8,
                              min_samples_leaf=8,
                              max_features=None,
                              oob_score=True,
                              random_state=42)

kf = KFold(n_splits=5, random_state=42, shuffle=True)
ridge_alphas = [1e-15, 1e-10, 1e-8, 9e-4, 7e-4, 5e-4, 3e-4, 1e-4, 1e-3, 5e-2,
              1e-2, 0.1, 0.3, 1, 3, 5, 10, 15, 18, 50, 75, 100]
# m_ridge is built but currently left out of the stack
m_ridge = RidgeCV(alphas=ridge_alphas, cv=kf)
m_svr = SVR(C=20, epsilon=0.004, gamma=0.0001)

# The base tree is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` across scikit-learn releases.
m_ab = AdaBoostRegressor(DecisionTreeRegressor(max_depth=5),
                        n_estimators=200,
                        loss='exponential',
                        learning_rate=0.01,random_state=42)
m_bag = BaggingRegressor(n_estimators=15,max_samples=10,max_features=5,random_state=42)
# Alternatives tried earlier: KNeighborsRegressor(), SVR() with defaults, and a
# VotingRegressor over ('rf', m_rf) and ('ridge', m_ridge).

# NOTE(review): both `seed` and `random_state` are given; confirm which one the installed xgboost honours.
xgboost = XGBRegressor(learning_rate=0.01,
                      n_estimators=1000,
                      max_depth=5,
                      gamma=0.3,
                      subsample=0.9,
                      colsample_bytree=0.8,
                      objective='reg:squarederror',
                      nthread=-1,
                      scale_pos_weight=1,
                      seed=27,
                      reg_alpha=0.00009,
                      random_state=42)
# `loss` is left at its default (squared error): the former spelling 'ls' has been
# removed from scikit-learn, while the default means the same thing in every release.
# An earlier variant used n_estimators=500, max_depth=4, min_samples_split=5, learning_rate=0.01.
m_gbr = GradientBoostingRegressor(n_estimators=500,
                              learning_rate=0.03,
                              max_depth=4,
                              max_features='sqrt',
                              min_samples_leaf=30,
                              min_samples_split=20,
                              random_state=42)
m_catboost = CatBoostRegressor(n_estimators=1000,
                              learning_rate=0.01,
                              rsm=0.4,
                              random_state=42)
model = StackingRegressor(
      estimators=[
          ('m_rf', m_rf),
          # ('m_ridge', m_ridge),
          ('m_svr', m_svr),
          ('m_bag', m_bag),
          # ('m_ab', m_ab),
          ('m_gbr', m_gbr),
          ('xgboost', xgboost),
          # ('m_catboost', m_catboost),
      ],
      n_jobs=-1,
      # verbosity must be an integer; the former 0.5 is rejected by scikit-learn's parameter validation
      verbose=1)

cv = KFold(n_splits=5, random_state=42, shuffle=True)
pred = cross_val_predict(model, X_train, y_train, cv=cv)
pred = np.abs(pred) # Take abs for negative values due to RMSLE metric
#print(rmsle(np.expm1(y_train), np.expm1(pred)))

In [None]:
# Display the cross-validated predictions (absolute values, still on the transformed target scale).
pred

In [None]:
# Cross-validated RMSLE of the stacked model.
# NOTE(review): targets/predictions are inverted with np.expm1 here but with np.exp in the
# other scoring cells - confirm which inverse matches the transform applied to 'Likes'.
print(np.sqrt(mean_squared_log_error(np.expm1(y_train), np.expm1(pred))))

In [None]:
#Grid Search
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV 

from sklearn.cluster import SpectralClustering

from sklearn.model_selection import cross_val_score

# Set the parameters by cross-validation.
# Keys follow scikit-learn's `<estimator name>__<parameter>` convention for the
# estimators nested in the stacked `model`.
# An earlier variant used one dictionary per base learner (each learner searched
# separately) and an exhaustive GridSearchCV; both were replaced by the single
# joint space below, sampled with RandomizedSearchCV.
# Option names use the current scikit-learn spellings:
#   'ls'  -> 'squared_error', 'lad' -> 'absolute_error' (gradient-boosting losses)
#   'auto' -> None (all features) for max_features
tuned_parameters = [
  {'m_rf__n_estimators': [50, 200, 400],
   'm_rf__max_depth': [2, 4, 7],
   'm_rf__min_samples_split': [3, 5, 8],
   'm_rf__max_features': [3, 5, 8],
   'm_svr__epsilon': [0.004, 0.008, 0.0016],
   'm_svr__C': [10, 20, 30],
   'm_svr__gamma': [0.0001, 0.0003, 0.0007],
   'm_bag__n_estimators': [5, 10, 15],
   'm_bag__max_samples': [1, 5, 10],
   'm_bag__max_features': [1, 5, 10],
   'm_gbr__n_estimators': [250, 500, 750],
   'm_gbr__learning_rate': [0.01, 0.03, 0.05],
   'm_gbr__max_features': [None, 'sqrt', 'log2'],
   'm_gbr__min_samples_leaf': [7, 15, 30],
   'm_gbr__min_samples_split': [7, 10, 20],
   'm_gbr__loss': ['huber', 'squared_error', 'absolute_error'],
   'xgboost__n_estimators': [500, 1000, 1500],
   'xgboost__learning_rate': [0.01, 0.03, 0.05],
   'xgboost__objective': ['reg:squarederror'],
   'xgboost__max_depth': [3, 5, 10],
   'xgboost__gamma': [0.3, 0.6, 0.9],
   'xgboost__subsample': [0.3, 0.8, 0.9],
   'xgboost__colsample_bytree': [0.3, 0.8, 0.9],
   'xgboost__reg_alpha': [0.00003, 0.00006, 0.00009]}
]

#change from neg_mean_squared_log_error to mean_squared_error as Likes are already in Log scale.
metric = 'neg_mean_squared_error'

# Seed both the folds and the parameter sampling so the search is reproducible
# (42 is the seed used throughout this notebook).
cv_strategy = KFold(n_splits=4, shuffle=True, random_state=42)
grid_search = RandomizedSearchCV(model,tuned_parameters,n_iter=20,scoring=metric,cv=cv_strategy,random_state=42)
grid_search.fit(X_train, y_train);
print('Finished!')

print("Best parameters set found on development set:")
print()
print(grid_search.best_params_)

print("Grid scores on development set:")
print()
# One line per sampled configuration: mean score, +/- two standard deviations across folds
means = grid_search.cv_results_['mean_test_score']
stds = grid_search.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))


In [None]:
# List every parameter of the stacked model; the nested `<name>__<param>` keys are
# the ones referenced in `tuned_parameters`.
model.get_params()

## Model Functions

In [None]:
#Stacking
# Final stacked ensemble used by the main code. m_ab and m_catboost are built but
# currently left out of the stack (see the commented entries below).
m_rf = RandomForestRegressor(n_estimators=200,
                              max_depth=2,
                              min_samples_split=8,
                              min_samples_leaf=8,
                              max_features=None,
                              oob_score=True,
                              random_state=42)

m_svr = SVR(C=20, epsilon=0.004, gamma=0.0001)

# The base tree is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` across scikit-learn releases.
m_ab = AdaBoostRegressor(DecisionTreeRegressor(max_depth=5),
                        n_estimators=200,
                        loss='exponential',
                        learning_rate=0.01,random_state=42)
m_bag = BaggingRegressor(n_estimators=15,max_samples=10,max_features=5,random_state=42)

# NOTE(review): both `seed` and `random_state` are given; confirm which one the installed xgboost honours.
xgboost = XGBRegressor(learning_rate=0.01,
                      n_estimators=1000,
                      max_depth=5,
                      gamma=0.3,
                      subsample=0.9,
                      colsample_bytree=0.8,
                      objective='reg:squarederror',
                      nthread=-1,
                      scale_pos_weight=1,
                      seed=27,
                      reg_alpha=0.00009,
                      random_state=42)
# `loss` is left at its default (squared error): the former spelling 'ls' has been
# removed from scikit-learn, while the default means the same thing in every release.
m_gbr = GradientBoostingRegressor(n_estimators=500,
                              learning_rate=0.03,
                              max_depth=4,
                              max_features='sqrt',
                              min_samples_leaf=30,
                              min_samples_split=20,
                              random_state=42)
m_catboost = CatBoostRegressor(n_estimators=1000,
                              learning_rate=0.01,
                              rsm=0.4,
                              random_state=42)
stack_mod = StackingRegressor(
      estimators=[
          ('m_rf', m_rf),
          # ('m_ridge', m_ridge),
          ('m_svr', m_svr),
          ('m_bag', m_bag),
          # ('m_ab', m_ab),
          ('m_gbr', m_gbr),
          ('xgboost', xgboost),
          # ('m_catboost', m_catboost),
      ],
      n_jobs=-1,
      # verbosity must be an integer; the former 0.5 is rejected by scikit-learn's parameter validation
      verbose=1)

In [None]:
def run_gradient_boosting(clf, fit_params, train, target, test, n_splits=5):
  """Cross-validate a gradient-boosting regressor and average its test predictions.

  The target is binned into (up to) 10 quantile buckets so that the folds can be
  stratified on a continuous variable. For every fold the model is re-fitted, scored
  on the validation part with ``av_metric`` and used to predict ``test``.

  Parameters
  ----------
  clf : estimator
      Must accept ``fit(X, y, eval_set=[(X_val, y_val)], **fit_params)`` and expose
      ``feature_importances_`` after fitting.
  fit_params : dict
      Extra keyword arguments forwarded to ``clf.fit``.
  train, test : pd.DataFrame
      Feature frames with identical columns.
  target : pd.Series
      Target values aligned with ``train``.
  n_splits : int, default 5
      Number of stratified folds.

  Returns
  -------
  oofs : np.ndarray
      Out-of-fold predictions, one per training row.
  preds : np.ndarray
      Test predictions averaged over the folds.
  fi : pd.Series
      Mean importance of the 20 most important features (also drawn as a bar chart).
  """
  oofs = np.zeros(len(train))
  preds = np.zeros((len(test)))

  features = list(train.columns.values)
  # shuffle=True is required for random_state to be accepted (scikit-learn raises
  # otherwise); this matches the shuffled, seeded KFold objects used elsewhere in the notebook.
  folds = StratifiedKFold(n_splits = n_splits, shuffle=True, random_state=42)
  stratified_target = pd.qcut(target, 10, labels = False, duplicates='drop')

  fold_importances = []  # one frame per fold, concatenated once after the loop

  for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, stratified_target)):
    print(f'\n------------- Fold {fold_ + 1} -------------')

    ### Training Set
    X_trn, y_trn = train.iloc[trn_idx], target.iloc[trn_idx]

    ### Validation Set
    X_val, y_val = train.iloc[val_idx], target.iloc[val_idx]

    _ = clf.fit(X_trn, y_trn, eval_set = [(X_val, y_val)], **fit_params)
    fold_importances.append(
        pd.DataFrame({'fold': fold_ + 1, 'feature': features, 'importance': clf.feature_importances_}))

    ### Regression predictions for the validation fold and for the test set
    preds_val = clf.predict(X_val)
    preds_test = clf.predict(test)

    fold_score = av_metric(y_val, preds_val)
    print(f'\nAV metric score for validation set is {fold_score}')

    oofs[val_idx] = preds_val
    preds += preds_test / n_splits  # running average over the folds


  oofs_score = av_metric(target, oofs)
  print(f'\n\nAV metric for oofs is {oofs_score}')

  feature_importances = pd.concat(fold_importances, axis=0).reset_index(drop = True)
  # Top 20 features by mean importance, reversed so the largest bar is drawn on top
  fi = feature_importances.groupby('feature')['importance'].mean().sort_values(ascending = False)[:20][::-1]
  fi.plot(kind = 'barh', figsize=(12, 6))

  return oofs, preds , fi

def av_metric(y_true, y_pred):
  """Root mean squared log error, computed after mapping both inputs through ``np.exp``.

  Both arguments are expected on the transformed (log-like) target scale used for
  training; they are exponentiated before scikit-learn's ``mean_squared_log_error``
  is applied.
  """
  actual = np.exp(y_true)
  predicted = np.exp(y_pred)
  msle = mean_squared_log_error(actual, predicted)
  return np.sqrt(msle)

## Main code

### Data pre-processing

In [None]:
# Tidy the raw tabular data (train and test are processed together by tidyer)
tidy_df_train,tidy_df_test = tidyer(df_train,df_test)

# Indices of the flattened-image values kept as individual pixel features
pixel_index_selection = [2043, 1947, 2046, 2142, 1893, 2139, 1986, 2238, 1944, 2079, 1950, 2337, 1848, 1752, 1509, 1797, 1983, 2181, 2040, 2082, 1413, 1989,
                         1851, 2274, 1656, 1566, 2241, 1605, 1896, 1992, 1701, 2178, 1599, 2268, 2235, 1695, 2271, 1560, 2175, 1890, 2364, 2085, 1293, 2367,
                         1662, 1290, 1287, 1464, 1857, 1414, 1296, 1488, 2370, 525, 1506, 1368, 2340, 1284, 1761, 1299, 1884, 1485, 1758, 1677, 1791, 1117,
                         1680, 1302, 2334, 2145, 1305, 1779, 1389, 1212, 1308, 1584, 1773, 1776, 1114, 1392, 1788, 1899, 2160, 1770, 1860, 2157, 1881, 1206,
                         1581, 1395, 1872, 1386, 1869, 1767, 1749, 2361, 2344, 1473, 1203]

# Append the per-image features, then the image-cluster features, column-wise
img_fea_train = img_extr(img_train,pixel_index_selection)
tidy_df_train = pd.concat([tidy_df_train,img_fea_train],axis=1)
img_fea_test = img_extr(img_test,pixel_index_selection)
tidy_df_test = pd.concat([tidy_df_test,img_fea_test],axis=1)

img_clust_train,img_clust_test = img_clust(img_train,img_test)
tidy_df_train = pd.concat([tidy_df_train,img_clust_train],axis=1)
tidy_df_test = pd.concat([tidy_df_test,img_clust_test],axis=1)

# Columns to standardise: profile counters, image statistics and the selected pixels
num_fea = [
    'Name length',
    'Profile Creation Year',
    'Num of Followers',
    'Num of People Following',
    'Num of Status Updates',
    'Num of Direct Messages',
    'Avg Daily Profile Visit Duration in seconds',
    'Avg Daily Profile Clicks',
    'Img Num Black Pixels',
    'Img Average Red level',
    'Img Average Green level',
    'Img Average Blue level',
    'Sum Selected Pixels',
]
num_fea += [f'pixel_{pixel_idx}' for pixel_idx in sorted(pixel_index_selection)]

# Columns to one-hot encode
cat_fea = [
    'Cover Image Status',
    'Profile Verification Status',
    'Time Region',
    'Location Public Visibility',
    'User Language',
    'Profile Category',
    'Img Spectral Clusters',
    'Img KMeans Clusters',
]
prep_df_train,prep_df_test = standard_encoder(tidy_df_train,tidy_df_test,num_fea,cat_fea)

### Feature selection

In [None]:
# Rank the features by |correlation| with 'Likes'; the first entry of the sorted
# index is skipped (presumably the target itself).
fea_sel = feature_select(prep_df_train)
all_features = fea_sel.index[1:]

cols_x = all_features[0:35]   #Top correlated features. Optimal feature selection = top 35
cols_y = 'Likes'

### Modelization

In [None]:
# Alternative kept for reference: CatBoost trained through run_gradient_boosting.
#catbst_model = CatBoostRegressor(n_estimators = 3000,
#                       learning_rate = 0.01,
#                       rsm = 0.4, ## Analogous to colsample_bytree
#                       random_state=2054,
#                       )
#fit_params = {'verbose': 200, 'early_stopping_rounds': 200}
#cb_oofs,pred_cb, fi = run_gradient_boosting(catbst_model, fit_params, prep_df_train[cols_x], prep_df_train[cols_y], prep_df_test[cols_x]);

# Fit the stacked ensemble on the full training set, restricted to the selected features.
stack_mod.fit(prep_df_train[cols_x],prep_df_train[cols_y]);

In [None]:
# RMSLE of the fitted stack on its own training data (an optimistic, in-sample score).
np.sqrt(mean_squared_log_error(np.exp(prep_df_train[cols_y]), np.exp(stack_mod.predict(prep_df_train[cols_x]))))

### Submission

In [None]:
import csv 

# Predict on the test features; predictions are on the transformed target scale.
pred = stack_mod.predict(prep_df_test[cols_x])
pred = np.abs(pred)  # negative predictions are folded back to positive values
# newline='' is what the csv module requires for files handed to csv.writer;
# without it, platforms that translate line endings produce blank rows.
with open('submission.csv', 'w', newline='') as csvfile:
  csvwriter = csv.writer(csvfile)
  csvwriter.writerow(['Id', 'Predicted'])
  # One row per test profile: its Id and the prediction mapped back through exp
  csvwriter.writerows(zip(df_test['Id'], np.exp(pred)))

# **Annex B - Neural Networks Approach (unsubmitted)**

## **Environment Set-up**



Taking care of colab disconnection :

+ https://medium.com/@shivamrawat_756/how-to-prevent-google-colab-from-disconnecting-717b88a128c0

In [None]:
cp -R /content/drive/MyDrive/02_ETUDES/01-MILA-UDEM/IFT6758/Kaggle/* .

In [None]:
!unzip -qq ift6758-a20.zip

In [None]:
# !pip install webcolors
!pip install catboost
!pip install lightgbm

In [None]:
!pip install scikit-image
!pip install opencv-python
# !git clone https://github.com/MeAmarP/sample_imgs.git

## **Import Lib**

In [None]:
import random
import numpy as np
import pandas as pd

from numpy import argmax
from numpy import loadtxt

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import backend as K

In [None]:
import csv
import math
import pdb

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, 
    VotingRegressor, StackingRegressor, BaggingRegressor,
)
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR, LinearSVR

from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import make_scorer
from sklearn import preprocessing


import seaborn as sns
from  matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn import linear_model
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from scipy.special import boxcox1p, inv_boxcox1p
from scipy.stats import boxcox_normmax
from scipy.stats import skew, norm
from scipy.stats import yeojohnson
from sklearn.preprocessing import PowerTransformer

pd.set_option('display.max_columns', None)

In [None]:
# Machine Learning
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import tree

# Ensembling
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier

# Metrics
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

from tensorflow import keras
from tensorflow.keras import layers
from keras.optimizers import SGD,Adam,RMSprop


In [None]:
#Imports
import skimage
import cv2 as cv2
import sklearn
import pandas as pd
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import ipywidgets as widgets
from ipywidgets import interact, interact_manual

# Report the OpenCV and Python versions in use.
print('OpenCV-Python Lib Version:', cv2.__version__)
print('Python Version:',sys.version)

## **Tabular Dataset**

### **Data loading (and overview)**

In [None]:
# Load both splits and tag every row with the split it came from, so the two can
# be told apart after they are concatenated.
# (read_csv's `converters` and `parse_dates` options could be useful here.)
data_train = pd.read_csv('train.csv').assign(source='train')
data_test = pd.read_csv('test.csv').assign(source="test")

In [None]:
def process_column_names(df):
  """Return a copy of ``df`` with normalised column labels.

  Every label has its spaces replaced by underscores, its question marks
  removed, and is lower-cased. The input frame itself is left untouched.
  """
  def _normalise(label):
    without_spaces = label.replace(' ', '_')
    return without_spaces.replace('?', '').lower()

  return df.rename(columns=_normalise)

data_train = process_column_names(data_train)
data_test = process_column_names(data_test)

In [None]:
# data_submission = pd.read_csv('sample_submission.csv')
# Name of the regression target column.
target_colname = 'num_of_profile_likes'
# Stack train on top of test so every cleaning step below is applied to both.
data_all = pd.concat([data_train, data_test])
for frame in (data_train, data_test, data_all):
  print(frame.shape)
data_all.head()

In [None]:
data_all.isnull().sum()

In [None]:
data_all.describe()

### **Indexing/Multi-Indexing**

In [None]:
print(data_all.shape)
# A column can serve as a unique key only if dropping duplicates on it leaves
# the row count printed above unchanged.
keys = ['id', 'user_name','profile_image']
for key in keys:
  test = data_all.drop_duplicates(key)
  print(key)
  print(test.shape)

In [None]:
# Index the combined frame by the key columns plus 'source'.
# Both steps are guarded so that re-running this cell is a no-op: the original
# unconditionally appended 'source' again and then failed in set_index because
# the key columns had already been moved into the index.
if 'source' not in keys:
  keys.append('source')
if list(data_all.index.names) != keys:
  data_all = data_all.set_index(keys=keys)
df = data_all

### **Types - All Features**

In [None]:
# df.info()
# For every column, print its dtype followed by its distinct values.
for col in df.columns:
  column = df[col]
  print(column.dtypes)
  print(column.unique())

### **Cleaning & Uniformization - All Features**

In [None]:
# Treat empty / whitespace-only strings as missing values.
# np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
df = df.replace(r'^\s*$', np.nan, regex=True)
# Map the '??' placeholder to 'unknown' and fold 'Pending' into 'not verified'.
df.location_public_visibility = df.location_public_visibility.replace('??','unknown')
df.profile_verification_status = df.profile_verification_status.replace('Pending','not verified')

### **Imputation - All Features**

+ First, based on the number of nulls per column, I will delete some of the features that I arbitrarily consider uninformative

+ Replace missing numerical data with the median of the column

+ I split the dataset per column dtype; be careful with `drop_duplicates()`, since I did not create an index column as a key for each subset


+ Careful: location and colors still need dedicated handling

+ https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.convert_dtypes.html
+ https://machinelearningmastery.com/knn-imputation-for-missing-values-in-machine-learning/

In [None]:
# Categorical gaps get an explicit placeholder label.
df['profile_category'] = df['profile_category'].fillna('unknown')
df['profile_cover_image_status'] = df['profile_cover_image_status'].fillna('not set')

# Numeric activity columns: fill gaps with the column median. The median is
# taken over the combined train + test frame.
for median_col in ('avg_daily_profile_visit_duration_in_seconds', 'avg_daily_profile_clicks'):
  df[median_col] = df[median_col].fillna(df[median_col].median())

df['utc_offset'] = df['utc_offset'].fillna(0)

# Missing colours default to white.
for color_col in ('profile_text_color', 'profile_page_color', 'profile_theme_color'):
  df[color_col] = df[color_col].fillna('ffffff')

### **Conversion - All Features**

In [None]:
# Parse the creation timestamp and expand it into calendar features; the
# original timestamp column is dropped afterwards.
df.profile_creation_timestamp = pd.to_datetime(df.profile_creation_timestamp)
col = "profile_creation_timestamp"
df[str(col+"_year")] = df[col].dt.year
# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0;
# dt.isocalendar().week is its replacement. isocalendar() yields a nullable
# UInt32 column, so it is cast back to a plain NumPy dtype (float64 when
# missing timestamps are present, int64 otherwise) so that the dtype-based
# column split further down still files it under the numerical features.
iso_week = df[col].dt.isocalendar().week
df[str(col+"_week")] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
df[str(col+"_month")] = df[col].dt.month
df[str(col+"_day")] = df[col].dt.day
# df[str(col+"_day_name")] = df[col].dt.day_name
df[str(col+"_week_day")] = df[col].dt.weekday
df[str(col+"_hour")] = df[col].dt.hour
df[str(col+"_quarter")] = df[col].dt.quarter
# df[str(col+"_minute")] = df[col].dt.minute
# df[str(col+"_seconde")] = df[col].dt.second
del df[col]


# Reduce the personal URL to a flag: True when a URL is present.
df.personal_url = df.personal_url.notnull()


def hex_to_rgb(hex):
  """Convert a hexadecimal colour string into an ``(r, g, b)`` tuple.

  Parameters
  ----------
  hex : str
      Colour such as ``'ffffff'``. Surrounding whitespace and a leading
      ``'#'`` are ignored, so ``'#ffffff'`` is accepted as well.

  Returns
  -------
  tuple of int
      Red, green and blue components parsed from the first three
      two-character pairs.

  Raises
  ------
  ValueError
      If one of the three pairs is empty or is not valid hexadecimal.
  """
  # Strip the optional '#' here so callers do not have to pre-clean values.
  digits = hex.strip().lstrip('#')
  return tuple(int(digits[i:i+2], 16) for i in (0, 2, 4))

# Expand each hex colour column into three integer columns named
# red_<column>, green_<column> and blue_<column>, then drop the hex column.
for suffix in ("profile_text_color", "profile_page_color", "profile_theme_color"):
  # Series.replace("#", "") only rewrites cells that are exactly "#"; the
  # string accessor is needed to really strip a leading '#' before parsing.
  rgb_values = df[suffix].str.lstrip("#").apply(hex_to_rgb)
  col = [x+'_{}'.format(suffix) for x in ["red","green","blue"]]
  df_temp = pd.DataFrame(rgb_values.values.tolist(), columns=col, index=df.index)
  df = pd.concat([df,df_temp], axis='columns')
  del df[suffix]


### <font color='red'>**Features Selection - not implemented**</font>

+ https://machinelearningmastery.com/feature-selection-with-real-and-categorical-data/

In [None]:
# corrMatrix = df_all.loc[:, df_all.columns != target_colname].corr()
# sns.heatmap(corrMatrix, annot=True)
# plt.show()

In [None]:
# covMatrix = np.cov(df_all.loc[:, df_all.columns != target_colname],bias=True)
# sns.heatmap(covMatrix, annot=True, fmt='g')
# plt.show()

In [None]:
# exclusion = ['location','user_time_zone','profile_text_color','profile_page_color','profile_theme_color']
# exclusion = ['location','user_time_zone']
# exclusion.append("utc_offset")

# df = df.drop(exclusion, axis=1)
# df = df.drop_duplicates()

# keep_col = [ 'profile_verification_status', 'profile_category',
#        'num_of_followers', 'num_of_people_following', 'num_of_status_updates',
#        'num_of_direct_messages', 'avg_daily_profile_visit_duration_in_seconds',
#        'avg_daily_profile_clicks', 'profile_creation_timestamp_year',
#         'num_of_profile_likes']

# df = df[keep_col]
# df = df.drop_duplicates()

# # print(df.shape)
# df.info()


### <font color='black'>**Encoding - Nominal Features**</font>

+ https://machinelearningmastery.com/why-one-hot-encode-data-in-machine-learning/
+ https://machinelearningmastery.com/one-hot-encoding-for-categorical-data/

+ https://stackoverflow.com/questions/40312128/how-to-lower-all-the-elements-in-a-pandas-dataframe

In [None]:
# Split the frame into dtype families.
# Selecting with include= instead of excluding an explicit list of exact
# dtypes keeps integer columns of any width in the numerical group. With the
# exclusion lists, any column that is not exactly float64/int64/object (for
# example the int32 that recent pandas returns from .dt accessors) ended up
# in df_bool.
df_bool = df.select_dtypes(include=['bool'])
df_nominal = df.select_dtypes(include=['object'])

# Numerical features, with the target separated into its own one-column frame.
df_numerical = df.select_dtypes(include=['number'])
df_target = df_numerical.loc[:, df_numerical.columns == target_colname]
df_numerical = df_numerical.loc[:, df_numerical.columns != target_colname]

df_datetime = df.select_dtypes(include=['datetime', 'datetimetz'])
# Unscaled snapshot of all numerical columns; this one still contains the target.
df_numerical_not_scaled = df.select_dtypes(include=['number'])


In [None]:
# Print the number of distinct values of each nominal column. A very high
# count means one-hot encoding that column would add a lot of dimensions, so
# another encoding is probably preferable there.
test = df_nominal
for col in test.columns:
  distinct_values = test[col].unique()
  print(col)
  print(len(distinct_values))

In [None]:
# Encode nominal and boolean features as numbers.
from numpy import asarray
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, LabelBinarizer

# NOTE(review): the previous `df_nominal[col] = pd.get_dummies(...)` assigned a
# multi-column dummy frame to a single column. Current pandas rejects that with
# "Columns must be same length as key"; older releases appear to have silently
# stored only the first dummy column (TODO confirm against the original run).
# That single-indicator result is reproduced explicitly here so each nominal
# feature keeps its original column name, which later cells select by name.
# TODO: switch to a full one-hot expansion (pd.get_dummies(df_nominal)) once
# those by-name column selections are updated accordingly.
for col in df_nominal.columns:
  dummies = pd.get_dummies(df_nominal[col], prefix=str(col+"_"))
  df_nominal[col] = dummies.iloc[:, 0].astype('uint8')
# Booleans become 0/1 integers.
for col in df_bool.columns:
  df_bool[col] = df_bool[col].astype('int64')


### <font color='red'>**Outliers - Numerical Features**</font>


In [None]:
# Clamp every numerical feature to that column's 25th-95th percentile range.
# The quantiles are computed over the combined train + test frame.
# NOTE(review): the lower bound is the 25th percentile, so the lowest quarter
# of every column is flattened to one value; confirm this is intended.
q = df_numerical.quantile([0.25, 0.95])
for col in df_numerical.columns:
  print(col)
  lower, upper = q[col]
  df_numerical[col] = df_numerical[col].clip(lower, upper)

### <font color='black'>**Scaling - Numerical Features**</font>

+ https://machinelearningmastery.com/how-to-transform-target-variables-for-regression-with-scikit-learn/

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer, Normalizer, RobustScaler


# Choose the feature scaler; the alternatives are kept commented out so they
# can be swapped in quickly.
# scaler = StandardScaler()
scaler = MinMaxScaler()
# scaler = PowerTransformer()
# scaler = Normalizer()
# scaler = RobustScaler()
# NOTE: this is an alias of the same object, not a second scaler. Fitting
# target_scaler on the target would overwrite the fit made on the features.
target_scaler = scaler

# Fit on, and rescale, every numerical feature column (the target was split
# off into df_target earlier and is not scaled here).
df_numerical[df_numerical.columns] = scaler.fit_transform(df_numerical[df_numerical.columns])
df_numerical.head()
# df_target[df_target.columns] = target_scaler.fit_transform(df_target[df_target.columns])

# scaled_data_arr = scaler.fit_transform(df_numerical)
# df_numerical.update(pd.DataFrame(scaled_data_arr, columns=df_numerical.columns))
# pow_transformed_data_arr = scaler.fit_transform(df_numerical.values)
# # print(df_numerical.head())
# # df_numerical.update(pd.DataFrame(pow_transformed_data_arr, columns=df_numerical.columns))
# # print(df_numerical.head())





## **Compute PCA Principal Components**

In [None]:
from sklearn.decomposition import PCA


# Project the scaled numerical features onto their first nb_cp principal
# components and save the result next to the notebook.
nb_cp = 5
df_temp = df_numerical
pca = PCA(n_components=nb_cp)
col = ['cp_{}'.format(rank) for rank in range(1, nb_cp + 1)]
print(col)
is_feature = df_temp.columns != target_colname
principalComponents = pca.fit_transform(df_temp.loc[:, is_feature])
df_principal = pd.DataFrame(principalComponents, index=df_temp.index, columns=col)

df_principal.to_csv('df_PCA_components.csv')
df_principal.head()


## **Compute VGG Features**

### **Loading Data (images path)**

In [None]:
# Collect the full path of every profile picture, train images first.
env_path = os.path.join(os.getcwd(), 'train_profile_images/profile_images_train')
img_list_train = [os.path.join(env_path, name) for name in os.listdir(env_path)]
env_path = os.path.join(os.getcwd(), 'test_profile_images/profile_images_test')
img_list_test = [os.path.join(env_path, name) for name in os.listdir(env_path)]
img_list = img_list_train + img_list_test

# Load the second listed image as a sample. Entries of img_list are already
# absolute paths, so os.path.join returns img_list[1] unchanged here.
img_test_path = os.path.join(env_path,img_list[1])
print(img_test_path)
img_test = cv2.imread(img_test_path,cv2.IMREAD_UNCHANGED)


### **Images Preparation**


In [None]:

# Exploratory cell: load the sample image, crop a 3-pixel border and display
# it rotated by 0, 90, 180 and 270 degrees. Nothing computed here feeds the
# feature extraction cells that follow in this section.
MainImgBin = cv2.imread(img_test_path,cv2.IMREAD_UNCHANGED)
print('DatatypeClass of Image:',type(MainImgBin))
print('Shape/Size of Binary Img:', MainImgBin.shape)

# The same file is read a second time into `img`, the array used below.
img = cv2.imread(img_test_path,cv2.IMREAD_UNCHANGED)
print(img.shape)
# NOTE(review): unpacking three values assumes a 3-D (rows, cols, channels)
# array; confirm every image is read that way with IMREAD_UNCHANGED.
rows,cols,dim = img.shape

# Border width, in pixels, removed from each side.
down=left=right=top=3

# Slice out the region of interest. `right` is used as the column start and
# `left` as the column end offset; harmless here because all four are equal.
cropped = img[top:rows-down,right:cols-left].copy()

# print(cropped.shape)
# plt.subplot(121),plt.imshow(img,cmap='gray'),plt.title('Original',color='c')
# plt.subplot(122),plt.imshow(cropped,cmap='gray'),plt.title('Cropped',color='c')

import random 
# Rotation angles to preview, in degrees.
angle_list = [i for i in range(0,360,90)]
# angle_val = random.choice(test_list) 
for angle_val in angle_list:
  print(angle_val)
  scaleFactor=1
  rows,cols,dim = img.shape
  # Rotate about the geometric centre of the image.
  imgCenter = (cols-1)/2.0,(rows-1)/2.0
  # Calculate an affine matrix of 2D rotation.
  rotateMat = cv2.getRotationMatrix2D(imgCenter,angle_val,scaleFactor)
  # Apply the affine transformation, keeping the original canvas size.
  out_img = cv2.warpAffine(img,rotateMat,(cols,rows))
  # plt.figure(figsize=(10,10))
  # plt.subplot(1,2,1), plt.imshow(img,cmap='gray') ,plt.title('Original Image',color='c')
  plt.subplot(1,2,2), plt.imshow(out_img,cmap='gray'), plt.title('Rotated Image',color='c')
  plt.show()


### **VGGs Features Extraction**

In [None]:
# Build the image feature extractor: a VGG16 convolutional base followed by a
# stack of new Dense layers that ends in a 6-unit softmax. The 6 softmax
# outputs are what later cells store as image features.
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.applications.vgg16 import decode_predictions
from keras.applications.vgg16 import VGG16
from keras.applications.vgg19 import VGG19

from keras.models import Model
from keras.layers import Dense, Conv2D, MaxPooling2D
from keras.layers import Flatten
from keras.models import Model
from pickle import dump

# Input shape is taken from the sample image loaded earlier.
# NOTE(review): this unpacking assumes img_test is a 3-D array; confirm the
# channel count is one the VGG16 constructor accepts.
d1,d2,d3 = img_test.shape

# load model
# load model without classifier layers
model_VGG16 = VGG16(include_top=False, input_shape = (d1, d2, d3))
# model_VGG16 =  VGG16(weights = "imagenet", include_top = False, input_shape = (d1, d2, d3))
# model_VGG19 = VGG19(weights = "imagenet", include_top = False, input_shape = (d1, d2, d3))


# Mark the first five layers as non-trainable.
# NOTE(review): no compile()/fit() call on this model is visible in this part
# of the notebook, so the flag presumably has no effect on the extracted
# features; confirm.
model_VGG = model_VGG16
for layer in model_VGG.layers[:5]:
   layer.trainable = False
  
# Re-root the model on the output of its second-to-last layer.
# NOTE(review): the base was built with include_top=False, so the layer being
# dropped here is presumably not a classifier output; confirm which layer it is.
model_VGG = Model(inputs=model_VGG.inputs, outputs=model_VGG.layers[-2].output)
# vizualize the layers
# print(model_VGG.summary())

# Add the new dense head on top of the truncated base.
# NOTE(review): these Dense layers are freshly created and no training step is
# visible here, so their weights appear to stay at their initial values; confirm.
flat1 = Flatten()(model_VGG.layers[-1].output)
class1 = Dense(1024, activation='relu')(flat1)
class2 = Dense(512, activation='relu')(class1)
class3 = Dense(256, activation='relu')(class2)
class4 = Dense(64, activation='relu')(class3)
output = Dense(6, activation='softmax')(class4)

# define final model
model_final = Model(inputs=model_VGG.inputs, outputs=output)
print(model_final.summary())

In [None]:
# define cnn model
# def define_model():
# 	model_cnn_baseline = Sequential()
# 	model_cnn_baseline.add(Conv2D(32, (3, 3), activation='relu', kernel_initializer='he_uniform', input_shape=(d1, d2, d3)))
# 	model_cnn_baseline.add(MaxPooling2D((2, 2)))
# 	model_cnn_baseline.add(Flatten())
# 	model_cnn_baseline.add(Dense(100, activation='relu', kernel_initializer='he_uniform'))
# 	model_cnn_baseline.add(Dense(10, activation='softmax'))
# 	# compile model
# 	# opt = Adam(lr=0.01, momentum=0.9)
# 	model_cnn_baseline.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['accuracy'])
# 	return model_cnn_baseline


In [None]:
# Run every profile picture through model_final and keep its output vector
# together with the file name, which is used later as the merge key.
results = []
col = []
for img_path in img_list:  # for img_name in img_list[1:3]:
  # os.path.basename replaces the manual split on '/', so the file name is
  # extracted correctly whatever path separator the platform uses.
  img_name = os.path.basename(img_path)

  # Load the image resized to the network input size and add a batch axis.
  image = img_to_array(load_img(img_path, target_size=(d1, d2)))
  image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
  # prepare the image for the VGG model
  image = preprocess_input(image)

  # `features` keeps the last prediction; the next cell reads its shape.
  features = model_final.predict(image)

  # One result row = the feature values followed by the image file name.
  temp = features[0].tolist()
  temp.append(img_name)
  results.append(temp)
  # save to file
  # dump(features, open('dog.pkl', 'wb'))
print("done")

In [None]:
# Assemble the per-image outputs into a frame with columns
# feature_1..feature_N followed by the image file name, and save it.
nb_features = features.shape[1]
col = ['feature_{}'.format(idx) for idx in range(1, nb_features + 1)] + ['profile_image']
df_vgg_results = pd.DataFrame(results, columns=col)
df_vgg_results.to_csv('df_VGG_features.csv')
df_vgg_results.head()

## **Merge Dataframes**

In [None]:
# Unscaled dataset: encoded nominal columns, raw numerical columns (the
# target is part of df_numerical_not_scaled) and the 0/1 boolean columns.
df_all_not_scaled = pd.concat([df_nominal, df_numerical_not_scaled, df_bool], axis=1)
print(df_all_not_scaled.shape)
df_all_not_scaled.head()

In [None]:
# Scaled dataset: same layout, but with the scaled numerical features and the
# (unscaled) target appended as the last column.
df_all_scaled = pd.concat([df_nominal, df_numerical, df_bool, df_target], axis=1)
print(df_all_scaled.shape)
df_all_scaled.head()

In [None]:
# Attach the image features to the tabular data. The index is flattened so
# 'profile_image' is available as a join column, then restored afterwards.
# Rows without a matching image keep NaN in the feature columns (left join).
df_temp = df_all_scaled.reset_index()
df_all_VGG = df_temp.merge(df_vgg_results, on='profile_image', how='left').set_index(keys)
df_all_VGG.head()

In [None]:
# Append the principal components as extra columns; rows are aligned on the
# shared index.
df_temp = df_all_scaled
df_all_PCA = pd.concat([df_temp, df_principal], axis=1)
df_all_PCA.head()

In [None]:
# Build a reduced dataset restricted to an explicit list of columns.
print(df_all_scaled.columns)
# Columns to keep, selected by name; the target is the last entry. Every name
# must exist in df_all_scaled, otherwise the selection below raises KeyError.
keep_col = ['profile_cover_image_status', 'profile_verification_status', 'location',
        'user_language',
       'profile_category', 'utc_offset', 'num_of_followers',
       'num_of_people_following', 'num_of_status_updates',
       'num_of_direct_messages', 'avg_daily_profile_visit_duration_in_seconds',
       'avg_daily_profile_clicks', 'profile_creation_timestamp_year',
       'profile_creation_timestamp_week', 'profile_creation_timestamp_month',
       'profile_creation_timestamp_day', 'profile_creation_timestamp_week_day',
       'profile_creation_timestamp_hour', 'profile_creation_timestamp_quarter',
       'red_profile_text_color', 'green_profile_text_color',
       'blue_profile_text_color', 'red_profile_page_color',
       'green_profile_page_color', 'blue_profile_page_color',
       'red_profile_theme_color', 'green_profile_theme_color',
       'blue_profile_theme_color', 'personal_url',
       'is_profile_view_size_customized', 'num_of_profile_likes']

df_all_reduced = df_all_scaled[keep_col]

df_all_reduced.info()

### **Metric Definition +  Train-Test split**

In [None]:

# df_all_VGG
# df_all_PCA
# df_all_not_scaled
# df_all_scaled
# df_all_reduced
# Dataset variant used for modelling (see the list of alternatives above).
df_used = df_all_VGG
print(df_used.shape)
source_level = df_used.index.get_level_values('source')
# .copy() gives an independent frame, so the target transform below does not
# write into a slice of df_used (which triggers SettingWithCopyWarning).
df_train = df_used[source_level == 'train'].copy()

# The model is trained on log1p(target); predictions are mapped back with expm1.
df_train[target_colname] = np.log1p(df_train[target_colname])
print(df_train[target_colname].describe())

df_test = df_used[source_level == 'test'].copy()

In [None]:
df_used.head()

In [None]:
def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error between targets and predictions.

    Both arguments are passed unchanged to scikit-learn's
    ``mean_squared_log_error``; the square root of its result is returned.
    """
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)

In [None]:
def root_mean_squared_error(y_true, y_pred):
    """Keras metric: root mean squared error between two tensors.

    Parameters
    ----------
    y_true, y_pred : tensors
        Target and predicted values, as supplied by Keras during
        training and evaluation.

    Returns
    -------
    tensor
        ``sqrt(mean((y_pred - y_true) ** 2))`` built from backend ops.
    """
    # The backend is imported locally because no cell visible in this part of
    # the notebook binds the name `K`; this keeps the metric self-contained.
    from keras import backend as K
    return K.sqrt(K.mean(K.square(y_pred - y_true)))

#### <font color='black'>*Test Target Transformation*</font>



In [None]:
# print(df_target.describe())
# test = pd.DataFrame(target_scaler.inverse_transform(df_target),columns=['pred'])
# print(test.describe())

## **C- Neural Network Approach**


In [None]:
# Separate the features from the (log-transformed) target for training, and
# take the same feature columns from the competition test rows.
feature_mask = df_train.columns != target_colname
X_train = df_train.loc[:, feature_mask]
y_train = df_train[target_colname]
X_test = df_test.loc[:, df_test.columns != target_colname]
print(X_train.shape)

In [None]:
# X, y = df_train.loc[:, df_train.columns != target_colname], df_train[target_colname]
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
# print(X_train.shape)

In [None]:

# Train one network per epoch budget and score each on a held-out validation
# split. The previous version scored against `y_test`, a name that is only
# defined by the commented-out train_test_split cell above; the X_test built
# from df_test is the unlabeled competition set and cannot be scored.
# NOTE: the model left in `NN` after the loop is fit on the 75% training part
# of this split. TODO: refit on all of X_train before the final submission.
from keras.models import Sequential  # not imported in the visible cells; harmless if already in scope

X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

results=[]
epochs_list=[8,9,10,11,12,13,14,15]
n_features = X_train.shape[1]

for epochs in epochs_list :

  # Fully connected regressor; layer widths are multiples of the input width.
  NN = Sequential()
  NN.add(Dense(n_features, input_dim=n_features, kernel_initializer='normal', activation='relu'))
  NN.add(Dense(n_features*3, kernel_initializer='normal', activation='relu'))
  NN.add(Dense(n_features*2, kernel_initializer='normal', activation='relu'))
  NN.add(Dense(1, kernel_initializer='normal'))

  learning_rate = 0.01
  decay_rate = learning_rate / (epochs)
  momentum = 0.8
  batch_size=32

  opt = SGD(lr=learning_rate, momentum=momentum, decay=decay_rate, nesterov=False)
  NN.compile(optimizer = opt, loss = 'mean_squared_error', metrics =[root_mean_squared_error])

  NN.fit(X_fit, y_fit, epochs=epochs, batch_size=batch_size)

  # Predictions are in log1p space; abs() keeps them non-negative so the
  # back-transformed values are valid inputs for the log-based metric.
  pred = np.abs(NN.predict(X_val)).ravel()

  # Score in the original scale of the target.
  results.append(rmsle(np.expm1(y_val), np.expm1(pred)))

print(results)

In [None]:
# Predict on the competition test rows and write the submission file.
sub_test = df_test.loc[:, df_test.columns != target_colname]
# Apply the same post-processing as the validation loop (abs() in log1p
# space) before back-transforming, so the submitted values match the scored
# pipeline and are never negative.
pred = np.abs(NN.predict(sub_test))
pred_NN = pd.DataFrame(df_test.index.get_level_values('id').values, columns=["Id"])
pred_NN["Predicted"] = np.expm1(pred).ravel()
# pred_NN["Predicted"] = pd.DataFrame(target_scaler.inverse_transform(pred),columns=['pred'])
print(pred_NN.shape)
print(pred_NN.head())
pred_NN.to_csv('submission_NN_last2.csv',index=False)