## Read me

You need scikit-learn, NumPy and pandas installed; the first cell below installs pytz and xgboost. Place the `train.csv` and `test.csv` files in the same folder as this notebook or, if you are using Colab, upload both files through the Files panel. Run the cells in order, from top to bottom.

In [None]:
!pip install pytz
!pip3 install xgboost

Let us import the required libraries below.

In [1]:
import pandas as pd
import numpy as np
import datetime, pytz
import xgboost as xgb
from sklearn.svm import SVR
from sklearn import preprocessing 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_squared_log_error

Now, we will read our data CSV file. 

In [2]:
# Load the training set and drop exact duplicate rows. The index is reset so that the
# row labels remain 0..n-1: later cells select rows by position, and the gaps that
# drop_duplicates() leaves in the index would otherwise misalign features and labels.
data = pd.read_csv('train.csv').drop_duplicates().reset_index(drop=True)

We will define the preprocessing functions that we have developed. 

In [3]:
#Timezone names provided by pytz; the location/timezone helper functions below match text against this list
tz_l = pytz.all_timezones
def processF(text, timezones=None):
    """Map a free-text location to the first timezone name containing one of its words.

    The text is lower-cased and split on whitespace, commas and slashes. Each
    resulting word is then tried, in order, as a substring of the lower-cased
    timezone names; the first hit wins.

    Parameters
    ----------
    text : object
        Raw location value. It is converted with ``str`` first, so a float NaN
        arrives here as the string ``"nan"``.
    timezones : iterable of str, optional
        Candidate timezone names. Defaults to the module-level ``tz_l``.

    Returns
    -------
    str
        The matching timezone name in its original casing, or the input
        (as a string) when it is ``"nan"`` or when no word matches.
    """
    text = str(text)
    if text == "nan":
        return text
    if timezones is None:
        timezones = tz_l

    # Compare in lower case on both sides; otherwise a capitalised word such as
    # "London" can never be found inside the lower-cased name "europe/london".
    candidates = [(name, name.lower()) for name in timezones]

    # Commas and slashes act as word separators ("Paris, France", "London/UK").
    words = text.lower().replace(',', ' ').replace('/', ' ').split()

    # NOTE: substring matching is deliberately loose; very short words can hit
    # unrelated timezone names.
    for word in words:
        for name, lowered in candidates:
            if word in lowered:
                return name

    return text

In [4]:
def simple_preprocess(text, timezones=None):
    """Return the first timezone name that contains any word of ``text``.

    Commas are removed and the text is lower-cased before it is split into
    words. Timezone names are scanned in their given order, and the first name
    whose lower-cased form contains one of the words is returned.

    Parameters
    ----------
    text : object
        Location text; converted with ``str`` so non-string values do not fail.
    timezones : iterable of str, optional
        Candidate timezone names. Defaults to the module-level ``tz_l``.

    Returns
    -------
    str
        The matching timezone name in its original casing; ``"nan"`` unchanged;
        otherwise the normalised (comma-free, lower-cased) text.
    """
    text = str(text)
    if text == "nan":
        return text
    if timezones is None:
        timezones = tz_l

    # Normalise once; this does not depend on the timezone being examined.
    text = text.replace(',', '').lower()
    words = text.split()

    for name in timezones:
        lowered = name.lower()
        if any(word in lowered for word in words):
            return name
    return text

In [5]:
def utcprocess(strval):
    """Return the current UTC offset, in seconds, for a timezone name.

    Parameters
    ----------
    strval : object
        Candidate timezone name.

    Returns
    -------
    float or object
        The offset in seconds when ``str(strval)`` is listed in ``tz_l``;
        otherwise ``strval`` itself, unchanged (this includes ``"nan"``).
    """
    if strval == "nan":
        return strval

    zone_name = str(strval)
    if zone_name not in tz_l:
        return strval

    # datetime.utcnow() is deprecated; build an aware "now" in UTC directly.
    # The offset is evaluated at the current instant, so it can vary with
    # daylight-saving time.
    now_utc = datetime.datetime.now(pytz.utc)
    return now_utc.astimezone(pytz.timezone(zone_name)).utcoffset().total_seconds()

The function below is the main preprocessing function. It fills missing values, encodes the categorical columns, and applies the skew-reducing transforms and scaling.

In [6]:
def preprocess(data_nan_adjusted, reference_year=2021):
    """Engineer model features from a raw profile DataFrame.

    Adds indicator, date, frequency-encoded, one-hot, log-transformed and
    scaled columns next to the raw ones. Every statistic used here (color
    frequencies, medians, means, scaler and quantile fits) is computed from
    the frame that is passed in.

    Parameters
    ----------
    data_nan_adjusted : pandas.DataFrame
        Raw data. Columns are added to / overwritten on this object, so pass a
        copy if the original must be preserved.
    reference_year : int, optional
        Year from which the account age is measured (default 2021).

    Returns
    -------
    pandas.DataFrame
        The frame with all engineered columns (raw columns are still present).
    """
    frame = data_nan_adjusted

    # 1 when the profile lists a personal URL, 0 when it is missing
    # (the original used isna(), which produced the inverse of the column name)
    frame['url_present'] = frame["Personal URL"].notna().astype(int)

    # Extract year and month information from the profile creation date
    frame["Date Created"] = pd.to_datetime(frame["Profile Creation Timestamp"])
    frame["Year Created"] = frame["Date Created"].dt.year
    frame["Months"] = frame["Date Created"].dt.to_period('M').astype(int)
    # Replace the timestamp by the account age: reference_year - year_of_creation
    # (the year is taken from the last space-separated token of the raw string)
    frame['Profile Creation Timestamp'] = frame['Profile Creation Timestamp'].apply(
        lambda stamp: reference_year - int(stamp.split(" ")[-1]))

    # Colors: fill missing values with 'ffffff', then frequency-encode
    # (frequency encoding performed best for these columns)
    color_columns = ['Profile Text Color', 'Profile Page Color', 'Profile Theme Color']
    for column in color_columns:
        # Assign the result back: in-place fillna on a column selection is
        # chained assignment and does not update the frame under Copy-on-Write.
        frame[column] = frame[column].fillna('ffffff')
    for position, column in enumerate(color_columns, start=1):
        frequency = frame.groupby(column).size() / len(frame)
        frame[f'col{position}_encode'] = frame[column].map(frequency)

    # Missing cover-image status falls back to the "view size customized" flag,
    # then the labels are converted to 0/1
    frame["Profile Cover Image Status"] = (
        frame["Profile Cover Image Status"]
        .fillna(value=frame['Is Profile View Size Customized?'])
        .replace({"Not set": 0, "Set": 1})
        .astype(int)
    )

    # Missing clicks: median of the same profile category
    clicks_median = frame.groupby(['Profile Category'])['Avg Daily Profile Clicks'].transform('median')
    frame['Avg Daily Profile Clicks'] = frame['Avg Daily Profile Clicks'].fillna(clicks_median)

    # Missing visit duration: overall median - this has more correlation with likes
    visit_column = 'Avg Daily Profile Visit Duration in seconds'
    frame[visit_column] = frame[visit_column].fillna(frame[visit_column].median())

    # One-hot encoding for Profile Category. Every space character is replaced,
    # so a blank " " category becomes "unknown".
    frame['Profile Category'] = frame['Profile Category'].str.replace(" ", 'unknown')
    frame = pd.concat(
        [frame, pd.get_dummies(frame['Profile Category'], dtype=int, drop_first=True)], axis=1)

    # Label encoding for Profile Verification Status and Location Public Visibility
    frame["Profile Verification Status"] = frame["Profile Verification Status"].replace(
        {"Not verified": 0, "Verified": 1, "Pending": 0})
    frame["Location Public Visibility"] = frame["Location Public Visibility"].replace(
        {"??": 0, 'Disabled': 1, 'Enabled': 2, 'disabled': 1, 'enabled': 2})

    frame["Is Profile View Size Customized?"] = frame["Is Profile View Size Customized?"].astype(int)

    # Log transformations -> fixes skews
    count_columns = ['Num of People Following', 'Num of Followers',
                     'Num of Status Updates', 'Num of Direct Messages']
    for column in count_columns:
        frame[column + ' log'] = np.log(1 + frame[column])

    # One-hot encoding for user language (case-insensitive)
    frame = pd.concat(
        [frame, pd.get_dummies(frame['User Language'].str.lower(), dtype=int, drop_first=True)], axis=1)

    # Robust scaling (less sensitive to outliers) followed by a log transform.
    # NOTE(review): scaled values <= -1 would give NaN / -inf here - confirm the
    # click distribution never produces them.
    scaler = preprocessing.RobustScaler(quantile_range=(25, 75))
    frame[['Avg Daily Profile Clicks robust']] = np.log(
        1 + scaler.fit_transform(frame[['Avg Daily Profile Clicks']]))

    # Quantile transformation for the visit duration -> bell-curve-like shape.
    # random_state pins the transformer's internal sampling for reproducibility.
    visit_duration_transform = preprocessing.QuantileTransformer(
        output_distribution='normal', random_state=0)
    frame[['Avg Daily Profile Visit Duration in seconds robust']] = \
        visit_duration_transform.fit_transform(frame[[visit_column]])

    # Normalise the free-text location towards a timezone name
    frame['Location'] = frame['Location'].apply(processF).apply(simple_preprocess)

    # Missing UTC offsets: column mean; the robust transform improves correlation the most
    frame['UTC Offset'] = frame['UTC Offset'].fillna(frame['UTC Offset'].mean())
    frame[['UTC Offset']] = scaler.fit_transform(frame[['UTC Offset']])

    return frame

In [7]:
#Drop columns based on their correlation with the number of likes. This improves the regression output.
def drop_cols(data_nan_adjusted1, prevVar, test = False):
    """Return a copy of ``prevVar`` without the raw and unused feature columns.

    Parameters
    ----------
    data_nan_adjusted1 : object
        Ignored; kept only so existing calls keep working.
    prevVar : pandas.DataFrame
        Preprocessed frame. It is not modified.
    test : bool, optional
        Selects which list of language dummy columns is removed: the test-set
        list when true, the training-set list otherwise.

    Returns
    -------
    pandas.DataFrame
        New frame with the listed columns removed. A ``KeyError`` is raised if
        any listed column is absent.
    """
    always_dropped = [
        # identifiers and raw columns
        'Personal URL', 'Id', 'User Name', 'Year Created', 'Date Created', 'Profile Image',
        'Location', 'User Language', 'User Time Zone',
        'Profile Text Color', 'Profile Page Color', 'Profile Theme Color',
        # raw columns that have an encoded / transformed counterpart
        'Profile Category', 'Num of People Following', 'Num of Followers',
        'Num of Status Updates', 'Num of Direct Messages',
        'Avg Daily Profile Visit Duration in seconds', 'Avg Daily Profile Clicks',
    ]

    # Language dummy columns to discard; the two data sets contain different languages
    test_languages = ['pt', 'es', 'en-gb', 'ru', 'it', 'nl', 'ca',
                      'de', 'fr', 'id', 'fi', 'he', 'pl', 'no', 'ro', 'th',
                      'zh-cn', 'cs', 'sv', 'hu']
    train_languages = ['ca', 'cs', 'da', 'de', 'el', 'en-gb', 'es', 'fi', 'fr', 'hu',
                       'id', 'it', 'nl', 'pl', 'ru', 'pt', 'sk', 'sr', 'sv', 'th',
                       'uk', 'zh-tw', 'zh-cn']
    language_columns = test_languages if test else train_languages

    return prevVar.drop(columns=always_dropped + language_columns)

In [8]:
def scaling(data_nan_adjusted1):
    """Standardise every column except 'Num of Profile Likes'.

    A new ``StandardScaler`` is fitted on the frame passed in, so the result
    depends on that frame's own statistics. ``columns.difference`` returns the
    feature names in sorted order, which fixes the column order of the output.

    Returns
    -------
    numpy.ndarray
        The scaled feature matrix.
    """
    feature_names = data_nan_adjusted1.columns.difference(['Num of Profile Likes'])
    return preprocessing.StandardScaler().fit_transform(data_nan_adjusted1[feature_names])

In [9]:
# Feature engineering on a copy of the training data (the raw `data` frame stays untouched)
data_nan_adjusted = preprocess(data.copy())
# drop_cols ignores its first argument and works on a copy of the second one
data_nan_adjusted1 = drop_cols(data_nan_adjusted.copy(), data_nan_adjusted, False)
# Despite its name, minmax_df holds the StandardScaler output returned by scaling() (a NumPy array)
minmax_df = scaling(data_nan_adjusted1)



In [10]:
#Taking the log of the labels since they are skewed: labels = log(likes + 1)
labels = np.log(data_nan_adjusted1['Num of Profile Likes'] + 1)
#Unscaled feature columns (everything except the target), in sorted column order
features = data_nan_adjusted1[data_nan_adjusted1.columns.difference(['Num of Profile Likes'])]

Model defined for prediction

In [11]:
# Base learners of the stacked ensemble; every stochastic learner is seeded for reproducibility
estimators = [
    ('bag svr', BaggingRegressor(SVR(kernel='rbf', C=1), random_state=0)),
    ('adaboost', AdaBoostRegressor(n_estimators=100, random_state=0)),
    ('xgb', xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3, learning_rate=0.1,
                             max_depth=4, alpha=10, random_state=0)),
    # `normalize` was removed from Ridge in scikit-learn 1.2; False was its default,
    # so omitting it keeps the model unchanged and avoids a TypeError on current versions
    ('ridge', Ridge(alpha=5.0)),
    ('rf', RandomForestRegressor(oob_score=True, random_state=0)),
]

# Meta-learner: bagged linear SVR fitted on the base learners' predictions
reg = StackingRegressor(
    estimators=estimators,
    final_estimator=BaggingRegressor(SVR(kernel='linear', C=1), n_estimators=50, random_state=0),
)

Cross Validation strategy used

In [39]:
####K FOLD CROSS VALIDATION
# Reports the RMSLE of the stacked model on the original like-count scale.
from sklearn.model_selection import KFold

kfold = KFold(n_splits=4, shuffle=True, random_state=42)
fold_errors = []
for train_index, test_index in kfold.split(minmax_df):
    X_train, X_test = minmax_df[train_index], minmax_df[test_index]
    # KFold yields row positions; .iloc selects the labels by position, whereas plain
    # [] on a Series is label-based and breaks when the index is not exactly 0..n-1
    y_train, y_test = labels.iloc[train_index], labels.iloc[test_index]

    reg.fit(X_train, y_train)

    # Undo the log(1 + likes) target transform, round down to whole likes, clip at 0
    y_pred = np.floor(np.exp(reg.predict(X_test))) - 1
    y_pred[y_pred < 0] = 0
    test_y_exp = np.exp(y_test) - 1

    # scikit-learn's argument order is (y_true, y_pred)
    err_m = np.sqrt(mean_squared_log_error(test_y_exp, y_pred))
    fold_errors.append(err_m)

    print("Error_m value is for test", err_m)

sumCount = sum(fold_errors)
count = len(fold_errors)
print("Overall K-Fold CV RMSLE is", (sumCount / count))

Error_m value is for test 1.7108834327657283
Error_m value is for test 1.7364874684993097
Error_m value is for test 1.713054421797459
Error_m value is for test 1.735657993947408
Overall K-Fold CV RMSLE is 1.7240208292524761


Code for final prediction of test data

In [40]:
# Final model: the same stacked ensemble, refitted on the full training set.
# The meta-learner is seeded so that the submission is reproducible.
reg2 = StackingRegressor(
    estimators=estimators,
    final_estimator=BaggingRegressor(SVR(kernel='linear', C=1), n_estimators=50, random_state=0),
)
reg2.fit(minmax_df, labels)

StackingRegressor(estimators=[('bag svr',
                               BaggingRegressor(base_estimator=SVR(C=1))),
                              ('adaboost', AdaBoostRegressor(n_estimators=100)),
                              ('xgb',
                               XGBRegressor(alpha=10, base_score=None,
                                            booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=0.3, gamma=None,
                                            gpu_id=None, importance_type='gain',
                                            interaction_constraints=None,
                                            learning_rate=0.1,
                                            m...
                                            n_estimators=100, n_jobs=None,
                                            num_parallel_tree=None,
            

In [41]:
test_d = pd.read_csv('test.csv')

#preprocess test data
data_nan_adjusted_t = preprocess(test_d.copy())
data_nan_adjusted1_t = drop_cols(data_nan_adjusted_t.copy(), data_nan_adjusted_t, True)

# Standardise the test features with the TRAINING mean/std. The model was fitted on
# training data scaled with those statistics, so fitting a fresh scaler on the test
# set would feed it features on a different scale.
train_feature_frame = data_nan_adjusted1[data_nan_adjusted1.columns.difference(['Num of Profile Likes'])]
test_feature_frame = data_nan_adjusted1_t[data_nan_adjusted1_t.columns.difference(['Num of Profile Likes'])]

# The model consumes a plain array, so the columns must line up one-to-one with training
assert list(test_feature_frame.columns) == list(train_feature_frame.columns), \
    "train and test feature columns differ"

feature_scaler = preprocessing.StandardScaler().fit(train_feature_frame)
data_nan_adjusted1_t_array = feature_scaler.transform(test_feature_frame)



Predict and Generate submission csv

In [42]:
#Predict the number of likes and write Submission.csv
# Undo the log(1 + likes) transform: exponentiate, round down, subtract 1, clip negatives to 0
y_pred_2 = np.floor(np.exp(reg2.predict(data_nan_adjusted1_t_array))) - 1
y_pred_2[y_pred_2 < 0] = 0
pd.set_option('display.max_columns', None)

# Assemble the submission frame with the columns in the required order
columns_titles = ["Id", "predicted"]
t_dataframe = pd.DataFrame({"predicted": y_pred_2})
t_dataframe["Id"] = test_d.Id
t_dataframe = t_dataframe[columns_titles]
t_dataframe.to_csv("Submission.csv", index = False)

Hence, the file needed for submission is generated. 