In [199]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [200]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [201]:
# Directory holding the competition CSV files (the trailing slash is relied on below).
data_path = '../input/how-to-be-instagram-famous-with-data-science/'
# dev.csv is the file the models in this notebook are fitted on.
df = pd.read_csv(f'{data_path}dev.csv')


In [202]:
# Report the (rows, columns) shape, then print the first five rows as plain text.
print('Shape of dataset: ', df.shape)
print(df.head(5))

In [203]:
# Print pandas' built-in summary of the frame (DataFrame.info())
df.info()

In [204]:
# Descriptive statistics for every column; include='all' asks describe() to cover
# all columns rather than only the numeric ones
df.describe(include='all')

In [205]:
# Count the missing (null) values in each column of the dataset
df.isnull().sum()
# Therefore, no need for median imputation to fill NA values as there are none

In [206]:
#converting categorical data into numerical data
# Weekday name -> day number lookup, built once instead of on every call.
_DAY_NUMBERS = {'MONDAY': 1, 'TUESDAY': 2, 'WEDNESDAY': 3, 'THURSDAY': 4,
                'FRIDAY': 5, 'SATURDAY': 6, 'SUNDAY': 7}


def day(n):
    """Convert a weekday name to its number (MONDAY=1 ... SUNDAY=7).

    Parameters
    ----------
    n : str or int
        A weekday name (case and surrounding whitespace are ignored), or a
        value that is already one of the numbers 1-7.

    Returns
    -------
    int
        The day number. Values that are already encoded are returned
        unchanged, so applying the function to a converted column a second
        time (e.g. when the cell is re-run) does not fail.

    Raises
    ------
    KeyError
        If ``n`` is neither a known weekday name nor a number from 1 to 7.
    """
    if isinstance(n, str):
        return _DAY_NUMBERS[n.strip().upper()]
    # Already-encoded value: pass it through so the conversion is idempotent.
    if n in _DAY_NUMBERS.values():
        return int(n)
    raise KeyError(n)
# Replace the weekday names in both day columns with their numeric codes from day().
for weekday_column in ('day_posted', 'current_day'):
    df[weekday_column] = df[weekday_column].apply(day)

# 'yes'/'no' -> 1/0 lookup, built once instead of on every call.
_FLAG_VALUES = {'yes': 1, 'no': 0}


def checker(n):
    """Convert a 'yes'/'no' flag to 1/0.

    Parameters
    ----------
    n : str or int
        'yes' or 'no' (case and surrounding whitespace are ignored), or a
        value that is already 0 or 1.

    Returns
    -------
    int
        1 for 'yes', 0 for 'no'. Values that are already 0 or 1 are returned
        unchanged, so applying the function to a converted column a second
        time (e.g. when the cell is re-run) does not fail.

    Raises
    ------
    KeyError
        If ``n`` is neither 'yes'/'no' nor 0/1.
    """
    if isinstance(n, str):
        return _FLAG_VALUES[n.strip().lower()]
    # Already-encoded value: pass it through so the conversion is idempotent.
    if n in _FLAG_VALUES.values():
        return int(n)
    raise KeyError(n)
# Encode the 'has_new_comments' column with checker() ('yes' -> 1, 'no' -> 0),
# then display the frame.
target_as_int = df['has_new_comments'].apply(checker)
df['has_new_comments'] = target_as_int
df

In [207]:
# Column names first: every column except the last two is a feature, and the
# second-to-last column is the target (the final column is used by neither).
col_x = list(df.columns[:-2])
col_y = df.columns[-2]
# The matching data, selected by position.
X = df.iloc[:, :-2]
Y = df.iloc[:, -2]

In [208]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

In [209]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# NOTE: a later cell imports train_test_split and RandomForestClassifier from cuml
# under the same names; on a fresh top-to-bottom run the names used here are the
# scikit-learn ones imported in the previous cell.

# Hold out 20% of the rows for validation; the split is seeded so it is repeatable.
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=12)

# Seed the forest as well: without random_state the fitted trees, the accuracy and
# the feature importances read from this model change on every run.
rf = RandomForestClassifier(n_estimators=100, random_state=12)
rf.fit(X_train, Y_train)

# Accuracy on the held-out rows, using every feature column in X.
y_pred_test = rf.predict(X_val)
print("Accuracy Score of random forest with all features: ", accuracy_score(Y_val, y_pred_test))

In [210]:
# Finding important features: read the feature_importances_ attribute of the
# fitted forest and print the raw array
importances = rf.feature_importances_
print(importances)

In [211]:
# Horizontal bar chart: one bar per feature name in col_x, bar length = its importance.
plt.barh(y=col_x, width=importances)

In [212]:
# Features whose feature_importances_ value is > 0.1 are kept.
# The important features are 'no_comments_24h' and 'delta_48_24h'.
col_x_imp = ['no_comments_24h', 'delta_48_24h']
# Restrict the feature frame to just those columns.
X_imp = X.loc[:, col_x_imp]
type(X_imp)

In [213]:
#applying random forest using Rapids only on the important features
import cuml, cudf, cuml
from cuml.model_selection import train_test_split
from cuml.ensemble import RandomForestClassifier
# NOTE: train_test_split and RandomForestClassifier are the cuML versions imported
# just above; they replace the scikit-learn names imported earlier in the notebook.

# Convert the selected features and the target to RAPIDS (cuDF) objects. They are
# rebuilt from the pandas objects X and Y rather than from X_imp itself, so re-running
# this cell does not pass an already-converted cuDF frame back into cudf.from_pandas.
X_imp = cudf.from_pandas(X.loc[:, col_x_imp])
Y_imp = cudf.from_pandas(Y)

# Seeded 80/20 split so the validation rows are the same on every run.
X_imp_train, X_imp_val, Y_imp_train, Y_imp_val = train_test_split(
    X_imp, Y_imp, test_size=0.2, random_state=12)

# Convert every split from int64 to float32 arrays before fitting.
X_imp_train = X_imp_train.values.astype('float32')
Y_imp_train = Y_imp_train.values.astype('float32')
X_imp_val = X_imp_val.values.astype('float32')
Y_imp_val = Y_imp_val.values.astype('float32')

# Seed the forest too, so the reported cuML accuracy is repeatable.
# NOTE(review): cuML documents that random_state may not fully guarantee identical
# results - confirm for the installed version.
cuRF = RandomForestClassifier(random_state=12)
cuRF.fit(X_imp_train, Y_imp_train)

In [214]:
# Evaluation metrics: accuracy of each forest on its own validation rows.
# NOTE: the two numbers are not a like-for-like comparison - the scikit-learn model
# uses every column of X, the cuML model only the two selected columns, and the two
# models were validated on splits made by separate train_test_split calls.
sk_score = accuracy_score(Y_val, y_pred_test)
predictions = cuRF.predict(X_imp_val)
cu_score = cuml.metrics.accuracy_score(Y_imp_val, predictions)

print('Random Forest Classification')
for label, score in (('CUML accuracy score: ', cu_score),
                     ('SKlearn accuracy score: ', sk_score)):
    print(label, score)

In [215]:
# Testing data: comp.csv holds the rows the final predictions are made for.
df2 = pd.read_csv(f'{data_path}comp.csv')
df2.head()

In [216]:
# Count the missing (null) values in each column of the testing data
df2.isnull().sum()

In [217]:
# Keep the same two feature columns that were selected for the training data.
X_test = df2[['no_comments_24h', 'delta_48_24h']]
X_test.head()

In [218]:
# Refit on every row of the dev data (no hold-out) using only the two selected
# features; the target is the second-to-last column of df.
# Both are converted to cuDF objects and cast to float32 before fitting.
train_X = cudf.from_pandas(df.loc[:, ['no_comments_24h', 'delta_48_24h']]).astype('float32')
train_Y = cudf.from_pandas(df.iloc[:, -2]).astype('float32')

# RandomForestClassifier is the cuML class at this point in the notebook. It is
# seeded so that the model, and therefore the submission file, can be regenerated.
# NOTE(review): cuML documents that random_state may not fully guarantee identical
# results - confirm for the installed version.
forest = RandomForestClassifier(random_state=12)
forest.fit(train_X, train_Y)


In [219]:
# Predict a label for every test row and store the labels as int64.
Y_test = forest.predict(X_test).astype('int64')
Y_test

In [220]:
def mapper(n):
    """Translate a 0/1 prediction back to its 'no'/'yes' label.

    The value is looked up by its ``str()`` form, so ``1`` and ``'1'`` both
    give ``'yes'``. Anything whose string form is not ``'0'`` or ``'1'``
    raises ``KeyError``.
    """
    labels_by_text = {'0': 'no', '1': 'yes'}
    key = str(n)
    return labels_by_text[key]
# Wrap the predictions in a single-column frame named 'has_new_comments' ...
Y_test = pd.DataFrame(Y_test, columns=['has_new_comments'])
# ... and replace each 0/1 with 'no'/'yes' via mapper().
Y_test['has_new_comments'] = Y_test['has_new_comments'].apply(mapper)
Y_test
# id_ = df2.iloc[:,-1]
# id_

In [221]:
# Take the last column of the testing data as the row identifiers
# (presumably the ID column: it is written out under the 'ID' header below - verify)
id_ = df2.iloc[:,-1]
id_

In [222]:
# Flatten the identifiers and the predicted labels to 1-D arrays. np.asarray accepts
# both the pandas objects built above and plain arrays, so this cell can be re-run
# (the previous `.values` calls failed once the names already held arrays).
id_ = np.reshape(np.asarray(id_), -1)
Y_test = np.reshape(np.asarray(Y_test), -1)

# Build the two-column submission in one step instead of appending row by row.
sub = pd.DataFrame({'ID': id_, 'has_new_comments': Y_test})
sub.to_csv('submission.csv', index = False)