In [1]:
# Imports
import os
import tensorflow as tf
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
%matplotlib inline

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# Seed every random number generator in use (reproducibility requirement)
SEED = 1
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
df_train = pd.read_csv('/kaggle/input/ift6758-a20/train.csv')
df_test = pd.read_csv('/kaggle/input/ift6758-a20/test.csv')
df_train.head()

As the matrix below shows, all numerical features are only slightly correlated with our target variable. We can therefore keep them all for now.

In [None]:
# Check which features are highly correlated with the target variable.
# Only numeric/boolean columns are correlated: DataFrame.corr() raises on string
# columns in pandas >= 2.0, and the raw frame still contains several of them.
correlation_matrix = (
    df_train.select_dtypes(include=['number', 'bool'])
    .corr()[['Num of Profile Likes']]
    .sort_values('Num of Profile Likes')
)
correlation_matrix

Let's see the distribution of our target variable: Number of Profile Likes. Before that, let's make sure that there are no NaN values.

In [None]:
df_train['Num of Profile Likes'].isnull().sum()

There are no missing values for our target variable. Let's keep going.

In [None]:
# histplot replaces the deprecated distplot and draws counts, which matches the y-axis label
sns.histplot(df_train['Num of Profile Likes'], bins=10, kde=True)
plt.ylabel('Count')
plt.title('Profile Likes Distribution')
plt.show()

We can see that our target variable is heavily right skewed. This usually indicates that there are some outliers. Let's detect those outliers.

In [None]:
#Print a boxplot for our target variable
plt.figure(figsize=(10,5))
sns.boxplot(x=df_train['Num of Profile Likes'])
plt.show()

Let's understand our features a bit more with respect to the target variable

In [None]:
# One scatter plot per feature against the target, laid out on a single row
fig, axes = plt.subplots(1, 4, figsize=(30,5))
features_to_plot = [
    'Num of Followers',
    'Num of Direct Messages',
    'Avg Daily Profile Visit Duration in seconds',
    'Avg Daily Profile Clicks',
]
for feature, ax in zip(features_to_plot, axes):
    sns.scatterplot(x=feature, y='Num of Profile Likes', data=df_train, ax=ax)
plt.show()

In [None]:
# One scatter plot per remaining feature against the target, laid out on a single row
fig, axes = plt.subplots(1, 3, figsize=(30,5))
features_to_plot = ['Num of People Following', 'UTC Offset', 'Num of Status Updates']
for feature, ax in zip(features_to_plot, axes):
    sns.scatterplot(x=feature, y='Num of Profile Likes', data=df_train, ax=ax)
plt.show()

Looking at the scatter plots above, there seems to be no obvious relationship between the features and the target variable. However, we notice that the values of our target variable are concentrated below 200000 likes. We can thus use that as a threshold for the outliers. We also notice that we could remove some outliers in the other features.

In [None]:
# Discard observations with more than 200000 likes or with fewer than 1 like
likes = df_train['Num of Profile Likes']
is_outlier = (likes > 200000) | (likes < 1)
df_train = df_train.drop(df_train[is_outlier].index)

Let's see how many observations we have left.

In [None]:
df_train.shape[0]

In [None]:
#Print a boxplot for our target variable
plt.figure(figsize=(10,5))
sns.boxplot(x=df_train['Num of Profile Likes'])
plt.show()

In [None]:
# histplot replaces the deprecated distplot and draws counts, which matches the y-axis label
sns.histplot(df_train['Num of Profile Likes'], bins=10, kde=True)
plt.ylabel('Count')
plt.title('Profile Likes Distribution')
plt.show()

Our distribution already looks better. However, it is still heavily skewed to the right. Let's apply log transformation to reduce skewness of the data. 

In [None]:
#Log transform the target variable
df_train['Num of Profile Likes'] = np.log1p(df_train['Num of Profile Likes'])

In [None]:
# histplot replaces the deprecated distplot and draws counts, which matches the y-axis label
sns.histplot(df_train['Num of Profile Likes'], bins=30, kde=True)
plt.ylabel('Count')
plt.title('Profile Likes Distribution')
plt.show()

In [None]:
#Print a boxplot for our target variable after log transforming
plt.figure(figsize=(10,5))
sns.boxplot(x=df_train['Num of Profile Likes'])
plt.show()

Let's now look at our independent variables (features).

In [None]:
df_train.isnull().sum()

For columns that contain a lot of NaN values, we will replace their values by 1 when present and 0 otherwise.

In [None]:
# Sparse text columns are reduced to a presence flag: 1 when a value exists, 0 when it is missing
for frame in (df_train, df_test):
    for column in ('Personal URL', 'Location'):
        frame[column] = frame[column].notnull().astype(int)

In [None]:
df_train.isnull().sum()

In [None]:
#Turn column Profile Cover Image Status into binary variable.
def update_profile_image_status(X):
    """Encode 'Profile Cover Image Status' as an integer 0/1 column.

    "Set" becomes 1; "Not set", missing values and any other non-numeric value
    become 0. Values that are already numeric are kept (cast to int), so the
    function can be applied again to a frame that was already encoded.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame holding a 'Profile Cover Image Status' column. The column is
        overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same frame `X`, with the column stored with an integer dtype
        instead of an object column holding a mix of ints and strings.
    """
    column = 'Profile Cover Image Status'
    status = X[column]
    encoded = status.map({"Set": 1, "Not set": 0})
    # Rows the mapping did not recognise fall back to their numeric value, if they have one
    already_numeric = pd.to_numeric(status, errors='coerce')
    X[column] = encoded.fillna(already_numeric).fillna(0).astype(int)
    return X

df_train = update_profile_image_status(df_train)
df_test = update_profile_image_status(df_test)

For non numerical features, let's see how many unique values they each contain.

In [None]:
# Series.nunique() counts the distinct values of a column; chaining it after
# value_counts() would instead count how many distinct frequencies occur.
for feature in ['Profile Text Color', 'Profile Page Color', 'Profile Theme Color', 'UTC Offset', 'User Time Zone']:
    print("Number of unique values for " + feature, df_train[feature].nunique())

Since each of the features above has many unique values, we can use frequency encoding for them. However, they also contain some NaN values, which we need to fill first. We will fill in the missing values with the most frequent value of each column.

In [None]:
categorical_features_to_impute = ['Profile Text Color', 'Profile Page Color', 'Profile Theme Color', 'UTC Offset', 'User Time Zone']
# The most frequent value of each column is learned on the training set only and
# reused for the test set, so both sets are imputed with the same values.
most_frequent_values = df_train[categorical_features_to_impute].mode().iloc[0]
df_train[categorical_features_to_impute] = df_train[categorical_features_to_impute].fillna(most_frequent_values)
df_test[categorical_features_to_impute] = df_test[categorical_features_to_impute].fillna(most_frequent_values)

In [None]:
df_train.isnull().sum()

Now that we have replaced NaNs with the most frequent element in each column, we will encode those categorical variables with their frequencies.

In [None]:
#Define function for frequency encoding
def encode_frequency_feature(X, feature, reference=None):
    """Replace every category of `feature` by its relative frequency.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose `feature` column is encoded. The column is overwritten in
        place.
    feature : str
        Name of the column to encode.
    reference : pandas.DataFrame, optional
        Frame the frequencies are computed from. Defaults to `X` itself. Pass
        the (not yet encoded) training frame when encoding a test frame so that
        both frames share the same encoding.

    Returns
    -------
    pandas.DataFrame
        The same frame `X`. Each value is replaced by
        (number of rows of the reference holding that value) / (number of rows
        of the reference). Missing values stay missing; categories that do not
        occur in the reference are encoded as 0.0.
    """
    source = X if reference is None else reference
    frequencies = source[feature].value_counts() / len(source)
    encoded = X[feature].map(frequencies)
    # A non-missing value absent from the reference has frequency zero there;
    # genuine missing values are left as NaN instead of raising a KeyError.
    encoded = encoded.mask(encoded.isna() & X[feature].notna(), 0.0)
    X[feature] = encoded
    return X

In [None]:
categorical_features_to_encode = categorical_features_to_impute
for feature in categorical_features_to_encode:
    # Frequencies are learned on the training set and applied to both sets, so a
    # given category is encoded with the same number in train and test.
    train_frequencies = df_train[feature].value_counts() / len(df_train)
    # Categories that never occur in the training set get a frequency of 0
    df_test[feature] = df_test[feature].map(train_frequencies).fillna(0)
    df_train[feature] = df_train[feature].map(train_frequencies)

We have encoded our categorical features using their frequency.

In [None]:
df_train[categorical_features_to_encode]

Let's see what is left for us to do.

In [None]:
df_train.isnull().sum()

We have some numerical features that contain NaNs. We could fill those NaNs with either the mean or the median of the column. To determine which is more appropriate, let's check each column for outliers using boxplots.

In [None]:
sns.boxplot(x='variable', y='value', data=pd.melt(df_train[['Avg Daily Profile Visit Duration in seconds', 'Avg Daily Profile Clicks']]))
plt.show()

We notice that the 'Avg Daily Profile Visit Duration in seconds' variable does not present any outliers. We can thus replace the missing values with the mean of the column.

In [None]:
# Fill missing visit durations of the training set with the training mean
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
column_to_impute = np.array(df_train['Avg Daily Profile Visit Duration in seconds']).reshape(-1, 1)
df_train['Avg Daily Profile Visit Duration in seconds'] = mean_imputer.fit(column_to_impute).transform(column_to_impute)

In [None]:
# Impute the test set with the mean learned on the training set; refitting the
# imputer on the test data would fill train and test with different values.
test_column_to_impute = np.array(df_test['Avg Daily Profile Visit Duration in seconds']).reshape(-1,1)
df_test['Avg Daily Profile Visit Duration in seconds'] = mean_imputer.transform(test_column_to_impute)

We no longer have any missing values for the 'Avg Daily Profile Visit Duration in seconds' feature.

In [None]:
df_train['Avg Daily Profile Visit Duration in seconds'].isnull().sum()

Now, what to do with feature 'Avg Daily Profile Clicks' which has many outliers? Since there are many of them, we can impute missing values with the median since the mean is very sensitive to outliers.

In [None]:
# Fill missing daily clicks of the training set with the training median
median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
column_to_impute = np.array(df_train['Avg Daily Profile Clicks']).reshape(-1, 1)
median_imputer = median_imputer.fit(column_to_impute)
imputed_clicks = median_imputer.transform(column_to_impute)
df_train['Avg Daily Profile Clicks'] = imputed_clicks

In [None]:
# Impute the test set with the median learned on the training set; refitting the
# imputer on the test data would fill train and test with different values.
test_column_to_impute = np.array(df_test['Avg Daily Profile Clicks']).reshape(-1,1)
df_test['Avg Daily Profile Clicks'] = median_imputer.transform(test_column_to_impute)

In [None]:
df_train['Avg Daily Profile Clicks'].isnull().sum()

In [None]:
df_train.info()

We still have a few object-typed features. Let's see what to do with them.

In [None]:
df_train[['Id', 'User Name', 'Profile Verification Status', 'User Language', 'Location Public Visibility', 'Profile Creation Timestamp', 'Profile Category', 'Profile Image']].head()

In [None]:
# Drop the identifier columns from the training set.
df_train = df_train.drop(columns=['Id', 'User Name'])
# NOTE(review): 'Id' is kept in df_test, so train and test end up with different
# columns — presumably it is needed to build the submission file; confirm.
df_test = df_test.drop(columns=['User Name'])

df_train.head()

Let's continue exploring our categorical features.

In [None]:
df_train['Profile Verification Status'].value_counts()

Considering 'Pending' status is not yet verified, we can merge pending and not verified together and turn our feature into a binary feature.

In [None]:
df_train['Profile Verification Status'].unique()

In [None]:
df_train['Profile Verification Status'] = np.where(df_train['Profile Verification Status'] == 'Verified', 1, 0)
df_test['Profile Verification Status'] = np.where(df_test['Profile Verification Status'] == 'Verified', 1, 0)

In [None]:
df_train['Profile Verification Status'].unique()

In [None]:
df_train.head()

We convert our boolean feature to int.

In [None]:
df_train['Is Profile View Size Customized?'] = df_train['Is Profile View Size Customized?'].astype(int)
df_test['Is Profile View Size Customized?'] = df_test['Is Profile View Size Customized?'].astype(int)

In [None]:
df_train['Location Public Visibility'].value_counts()

Let's combine some of the values in this feature.

In [None]:
# Treat the '??' placeholder as 'unknown', then lower-case every label so that
# spellings differing only by case are merged
for frame in (df_train, df_test):
    visibility = frame['Location Public Visibility']
    frame['Location Public Visibility'] = visibility.mask(visibility == '??', 'unknown').str.lower()

In [None]:
df_train['Location Public Visibility'].unique()

In [None]:
# Frequencies are learned on the training set and applied to both sets, so a given
# label is encoded with the same number in train and test (0 if unseen in train).
visibility_frequencies = df_train['Location Public Visibility'].value_counts() / len(df_train)
df_test['Location Public Visibility'] = df_test['Location Public Visibility'].map(visibility_frequencies).fillna(0)
df_train['Location Public Visibility'] = df_train['Location Public Visibility'].map(visibility_frequencies)

In [None]:
df_train['Location Public Visibility'].value_counts()

In [None]:
df_train.info()

Let's study User Language.

In [None]:
df_train['User Language'].value_counts()

In [None]:
sns.boxplot(x=df_train['User Language'].value_counts())
plt.show()

Merge the English variants (en-gb into en) and the Chinese variants (zh-cn and zh-TW into zh).

In [None]:
# Map regional language codes onto their base language
language_aliases = {'en-gb': 'en', 'zh-cn': 'zh', 'zh-TW': 'zh'}
for frame in (df_train, df_test):
    frame['User Language'] = frame['User Language'].replace(language_aliases)

In [None]:
df_train['User Language'].value_counts()

We now use frequency encoding to encode our feature.

In [None]:
# Frequencies are learned on the training set and applied to both sets, so a given
# language is encoded with the same number in train and test (0 if unseen in train).
language_frequencies = df_train['User Language'].value_counts() / len(df_train)
df_test['User Language'] = df_test['User Language'].map(language_frequencies).fillna(0)
df_train['User Language'] = df_train['User Language'].map(language_frequencies)

In [None]:
df_train.info()

Let's drop the profile image.

In [None]:
df_test.columns

In [None]:
df_train = df_train.drop(columns=['Profile Image'])
df_test = df_test.drop(columns=['Profile Image'])

In [None]:
df_train['Profile Category'].unique()

In [None]:
df_train.loc[df_train['Profile Category'] == ' ', 'Profile Category'] = 'unknown'
df_test.loc[df_test['Profile Category'] == ' ', 'Profile Category'] = 'unknown'

In [None]:
df_train['Profile Category'].unique()

We now frequency-encode the 'Profile Category' feature.

In [None]:
# Frequencies are learned on the training set and applied to both sets, so a given
# category is encoded with the same number in train and test (0 if unseen in train).
category_frequencies = df_train['Profile Category'].value_counts() / len(df_train)
df_test['Profile Category'] = df_test['Profile Category'].map(category_frequencies).fillna(0)
df_train['Profile Category'] = df_train['Profile Category'].map(category_frequencies)

In [None]:
df_train['Profile Category'].unique()

In [None]:
df_train.info()

In [None]:
def transform_timestamp(X, reference_year=2020):
    """Replace 'Profile Creation Timestamp' by 'Months Since Profile Creation'.

    The timestamp is split on single spaces; field 1 must be a three-letter
    English month abbreviation and field 5 the year (presumably strings such
    as "Thu Nov 27 05:24:59 +0000 2008" — TODO confirm against the raw data).

    Parameters
    ----------
    X : pandas.DataFrame
        Frame holding a string 'Profile Creation Timestamp' column. It is not
        modified.
    reference_year : int, optional
        Year the age is measured against. Defaults to 2020.

    Returns
    -------
    pandas.DataFrame
        A new frame without the timestamp column and with an integer
        'Months Since Profile Creation' column appended, computed as
        (reference_year - year) * 12 - month.
    """
    # Split once and reuse the pieces for both the year and the month
    parts = X['Profile Creation Timestamp'].str.split(" ", expand=True)
    creation_year = parts[5].astype(int)
    # Map month string to numerical value
    months = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}
    creation_month = parts[1].map(months)
    # A year contributes 12 months
    months_since_creation = (reference_year - creation_year) * 12 - creation_month
    transformed = X.drop(columns='Profile Creation Timestamp')
    transformed['Months Since Profile Creation'] = months_since_creation.astype(int)
    return transformed

In [None]:
df_train = transform_timestamp(df_train)
df_test = transform_timestamp(df_test)

In [None]:
df_train.head(5)

In [None]:
df_test.head(5)

In [None]:
# Check which features are highly correlated with the target variable.
# Only numeric/boolean columns are correlated: DataFrame.corr() raises on
# object-typed columns in pandas >= 2.0.
correlation_matrix = (
    df_train.select_dtypes(include=['number', 'bool'])
    .corr()[['Num of Profile Likes']]
    .sort_values('Num of Profile Likes')
)
correlation_matrix

In [None]:
# Plot the correlation coefficients between the features (target excluded).
# Only numeric/boolean columns are correlated: DataFrame.corr() raises on
# object-typed columns in pandas >= 2.0.
feature_correlations = (
    df_train.drop('Num of Profile Likes', axis=1)
    .select_dtypes(include=['number', 'bool'])
    .corr()
)
plt.figure(figsize=(12,10))
sns.heatmap(feature_correlations, vmax=0.6, square=True, annot=True)
plt.show()

Now that we have cleaned our data, let's see where we are at regarding outliers.

In [None]:
first_batch = ['Num of Followers', 'Num of People Following', 'Num of Status Updates', 'Num of Direct Messages', 'Avg Daily Profile Visit Duration in seconds', 'Avg Daily Profile Clicks', 'Months Since Profile Creation']

In [None]:
plt.figure(figsize=(20,15))
sns.boxplot(data=df_train[first_batch])
plt.show()

We thus see that some features contain a high number of outliers. We need to deal with some of them. 

In [None]:
print(df_train[first_batch].skew())
df_train[first_batch].hist(bins=15, figsize=(15, 6), layout=(2, 4))
plt.show()

In [None]:
print(df_test[first_batch].skew())
df_test[first_batch].hist(bins=15, figsize=(15, 6), layout=(2, 4))
plt.show()

We see that 'Avg Daily Profile Visit Duration in seconds' is more or less normally distributed. The same goes for 'Months Since Profile Creation'. We therefore do not need to transform them. The other features, however, are heavily skewed and we will need to deal with them.

In [None]:
# Rebuild the list instead of calling list.remove(), which raises ValueError
# when this cell is executed a second time.
roughly_normal_features = {'Avg Daily Profile Visit Duration in seconds', 'Months Since Profile Creation'}
first_batch = [feature for feature in first_batch if feature not in roughly_normal_features]

Let's complete a log transform of our skewed variables to see if we can minimize the skewness.

In [None]:
df_train[first_batch] = np.log1p(df_train[first_batch])
df_test[first_batch] = np.log1p(df_test[first_batch])

TRAIN after log transform

In [None]:
print(df_train[first_batch].skew())
df_train[first_batch].hist(bins=15, figsize=(15, 6), layout=(2, 4))
plt.show()

In [None]:
print(df_test[first_batch].skew())
df_test[first_batch].hist(bins=15, figsize=(15, 6), layout=(2, 4))
plt.show()

In [None]:
plt.figure(figsize=(20,15))
sns.boxplot(data=df_train[first_batch])
plt.show()

In [None]:
df_train.to_csv("/kaggle/working/preprocessed_train_data.csv", index=False)
df_test.to_csv("/kaggle/working/preprocessed_test_data.csv", index=False)