In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from gensim.models import FastText
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Feature tables: read each CSV and discard its 'Unnamed: 0' column.
# NOTE(review): 'Unnamed: 0' is presumably an index written by an earlier
# to_csv call — confirm against the script that produced these files.
testing_data = pd.read_csv("testing_data.csv").drop(columns=['Unnamed: 0'])
training_data = pd.read_csv("training_data.csv").drop(columns=['Unnamed: 0'])

In [3]:
# Label tables: read each CSV and discard its 'Unnamed: 0' column, leaving
# whatever label column(s) the files contain.
y_test = pd.read_csv("testing_labels.csv").drop(columns=["Unnamed: 0"])
y_train = pd.read_csv("training_labels.csv").drop(columns=["Unnamed: 0"])

In [4]:
import json
def get_concateated_data(dataset):
    """Flatten every column of ``dataset`` into one 2-D float feature matrix.

    Columns with object (or pandas string) dtype are treated as JSON-encoded
    lists of numbers, one list per row; each list is decoded into a row of
    floats. Every other column is cast to float and contributes one feature.

    The per-column blocks are stacked horizontally in *reverse* column order
    (the last column of ``dataset`` ends up leftmost). This matches the layout
    produced by the previous prepend-as-you-go implementation, so feature
    positions seen by downstream models do not change.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table whose columns are either numeric/boolean or JSON list strings.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_rows, total_features)``. When ``dataset`` has no
        columns an empty ``(n_rows, 0)`` array is returned.

    Raises
    ------
    Whatever ``json.loads`` / ``float`` raise for a cell that is not a JSON
    list of numbers (for example a missing value in a text column).
    """
    # Collect the blocks and concatenate once. The former
    # ``if final_data == []`` test compared an ndarray with a list, which
    # raises on current NumPy as soon as a second column is processed.
    blocks = []
    for col in dataset.columns:
        column = dataset[col]
        holds_json = (
            pd.api.types.is_object_dtype(column.dtype)
            or pd.api.types.is_string_dtype(column.dtype)
        )
        if holds_json:
            # One decoded list of floats per row -> (n_rows, list_length).
            feature_values = np.array(
                [[float(y) for y in json.loads(x)] for x in column]
            )
        else:
            # Scalar column -> single feature column of shape (n_rows, 1).
            feature_values = np.array(column.astype('float')).reshape(-1, 1)
        blocks.append(feature_values)

    if not blocks:
        return np.empty((len(dataset), 0))
    # Reverse so that later columns come first, as the original code did.
    return np.concatenate(blocks[::-1], axis=1)

In [5]:
def get_train_test(tr_data,te_data):
    """Build model-ready train and test frames from two raw feature tables.

    Each table is flattened with ``get_concateated_data``, wrapped in a
    DataFrame, and has its NaN entries replaced by 0.

    Returns
    -------
    tuple
        ``(train, test)`` as pandas DataFrames.
    """
    train, test = (
        pd.DataFrame(get_concateated_data(split)).fillna(0)
        for split in (tr_data, te_data)
    )
    return train, test

In [6]:
def randomForrest(train,test,y_tr,y_te):
    """Fit a random forest on ``train`` and measure its accuracy on ``test``.

    Parameters
    ----------
    train, test : array-like
        Feature matrices used for fitting and for prediction.
    y_tr, y_te : array-like
        Labels for ``train`` and ``test``. A single-column table is accepted.

    Returns
    -------
    tuple
        ``(accuracy, fitted RandomForestClassifier)``.
    """
    # scikit-learn expects 1-D labels for a single-output problem and emits a
    # DataConversionWarning for an (n, 1) column vector. Flatten that case
    # here; labels with several columns are passed through untouched.
    labels = np.asarray(y_tr)
    if labels.ndim == 2 and labels.shape[1] == 1:
        labels = labels.ravel()
    # random_state is fixed so repeated runs give the same forest.
    clf = RandomForestClassifier(random_state=1).fit(train, labels)
    y_predictions = clf.predict(test)
    return accuracy_score(y_te, y_predictions), clf

In [7]:
def mlpClassifier(train,test,y_tr,y_te):
    """Fit a multi-layer perceptron on ``train`` and score it on ``test``.

    Parameters
    ----------
    train, test : array-like
        Feature matrices used for fitting and for prediction.
    y_tr, y_te : array-like
        Labels for ``train`` and ``test``. A single-column table is accepted.

    Returns
    -------
    tuple
        ``(accuracy, fitted MLPClassifier)``.
    """
    # scikit-learn expects 1-D labels for a single-output problem and emits a
    # DataConversionWarning for an (n, 1) column vector. Flatten that case
    # here; labels with several columns are passed through untouched.
    labels = np.asarray(y_tr)
    if labels.ndim == 2 and labels.shape[1] == 1:
        labels = labels.ravel()
    # random_state is fixed so weight initialisation is repeatable.
    clf = MLPClassifier(random_state=1).fit(train, labels)
    y_predictions = clf.predict(test)
    return accuracy_score(y_te, y_predictions),clf

In [8]:
def linearSVC(train,test,y_tr,y_te):
    """Fit a linear support-vector classifier on ``train`` and score it on ``test``.

    Parameters
    ----------
    train, test : array-like
        Feature matrices used for fitting and for prediction.
    y_tr, y_te : array-like
        Labels for ``train`` and ``test``. A single-column table is accepted.

    Returns
    -------
    tuple
        ``(accuracy, fitted LinearSVC)``.
    """
    # scikit-learn expects 1-D labels for a single-output problem and emits a
    # DataConversionWarning for an (n, 1) column vector. Flatten that case
    # here; anything else is passed through untouched.
    labels = np.asarray(y_tr)
    if labels.ndim == 2 and labels.shape[1] == 1:
        labels = labels.ravel()
    # random_state is fixed so repeated runs give the same model.
    clf = LinearSVC(random_state=1).fit(train, labels)
    y_predictions = clf.predict(test)
    return accuracy_score(y_te, y_predictions),clf

In [9]:
def getPredictions(feature_array):
    """Score the three classifiers on a chosen subset of feature columns.

    The columns named in ``feature_array`` are selected from the module-level
    ``training_data`` / ``testing_data`` tables, flattened with
    ``get_train_test``, and evaluated against the module-level ``y_train`` /
    ``y_test`` labels.

    Returns
    -------
    tuple
        Accuracies in the order (random forest, MLP, linear SVC).
    """
    tr, te = get_train_test(training_data[feature_array], testing_data[feature_array])
    accuracies = []
    for fit_and_score in (randomForrest, mlpClassifier, linearSVC):
        # Each scorer also returns its fitted model, which is not needed here.
        accuracy, _ = fit_and_score(tr, te, y_train, y_test)
        accuracies.append(accuracy)
    return tuple(accuracies)

In [10]:
# Score the three classifiers (random forest, MLP, linear SVC) using only
# these four columns.
getPredictions([
 'selftext',
 'comments',
 'title',
 'url',
])

(0.39, 0.445, 0.44)

In [11]:
# Columns treated as boolean flags; scored as a group and excluded when
# numerical_cols is derived.
boolean_cols = [
    'allow_live_comments',
    'archived',
    'author_premium',
    'contest_mode',
    'hide_score',
    'is_original_content',
    'is_reddit_media_domain',
    'is_self',
    'is_video',
    'locked',
    'no_follow',
    'send_replies',
    'spoiler',
    'stickied',
]

In [12]:
# Score the three classifiers (random forest, MLP, linear SVC) on the
# boolean flag columns only.
getPredictions(boolean_cols)

(0.245, 0.2325, 0.235)

In [13]:
# Columns whose dtype is a NumPy numeric type, leaving out anything already
# listed in boolean_cols.
numerical_cols = [
    name
    for name in training_data.columns
    if np.issubdtype(training_data[name].dtype, np.number)
    if name not in boolean_cols
]

In [14]:
# Score the three classifiers (random forest, MLP, linear SVC) on the
# numerical columns only.
getPredictions(numerical_cols)

(0.78, 0.7525, 0.5775)

In [16]:
# Every column of the training feature table (a pandas Index, not a list).
all_cols = training_data.columns

In [17]:
# Score the three classifiers (random forest, MLP, linear SVC) on all
# columns together.
getPredictions(all_cols)

(0.7925, 0.775, 0.72)

In [18]:
# Score every column on its own: print the column name, then the
# (random forest, MLP, linear SVC) accuracy tuple for that single feature.
for col in all_cols:
    print(col)
    print(getPredictions([col]))

allow_live_comments
(0.1025, 0.1025, 0.1025)
archived
(0.1875, 0.1875, 0.1875)
author_premium
(0.12, 0.12, 0.12)
contest_mode
(0.09, 0.09, 0.09)
hide_score
(0.09, 0.09, 0.09)
is_original_content
(0.0875, 0.0875, 0.0875)
is_reddit_media_domain
(0.1125, 0.1125, 0.1125)
is_self
(0.1525, 0.1525, 0.1525)
is_video
(0.09, 0.09, 0.09)
locked
(0.09, 0.09, 0.09)
no_follow
(0.105, 0.105, 0.105)
send_replies
(0.1175, 0.12, 0.1175)
spoiler
(0.09, 0.09, 0.09)
stickied
(0.09, 0.09, 0.09)
created
(0.205, 0.225, 0.2075)
gilded
(0.1075, 0.1075, 0.105)
num_comments
(0.1375, 0.1325, 0.1375)
num_crossposts
(0.12, 0.12, 0.1075)
num_duplicates
(0.11, 0.1125, 0.1125)
score
(0.11, 0.13, 0.135)
subreddit_subscribers
(0.62, 0.6225, 0.4925)
thumbnail_height
(0.1525, 0.15, 0.135)
thumbnail_width
(0.15, 0.15, 0.15)
total_awards_received
(0.115, 0.12, 0.12)
upvote_ratio
(0.1375, 0.155, 0.1475)
selftext
(0.225, 0.2525, 0.25)
comments
(0.2525, 0.31, 0.3125)
title
(0.3125, 0.43, 0.3725)
url
(0.09, 0.09, 0.09)


In [20]:
# Score the three classifiers on a hand-written list of ten columns;
# 'subreddit_subscribers' is not among them.
# NOTE(review): this looks like numerical_cols minus 'subreddit_subscribers' —
# confirm, and if so consider deriving the list instead of retyping it.
getPredictions(['created',
 'gilded',
 'num_comments',
 'num_crossposts',
 'num_duplicates',
 'score',
 'thumbnail_height',
 'thumbnail_width',
 'total_awards_received',
 'upvote_ratio'])

(0.365, 0.3125, 0.2875)

In [25]:
# Whatever is left once the numerical and boolean columns are set aside.
string_cols = [
    name
    for name in all_cols
    if not (name in numerical_cols or name in boolean_cols)
]