In [2]:
import pandas as pd
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from gensim.models import FastText
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
import warnings
warnings.filterwarnings('ignore')

In [3]:
numerical_cols = np.load("numerical_cols.npy")


In [4]:
boolean_cols = np.load("boolean_cols.npy")


In [5]:
string_cols = np.load("string_cols.npy")


### Function that concatenates all features

In [6]:
import json
def get_concateated_data(dataset):
    final_data = []
    for col in dataset.columns:
        if col in string_cols:
            feature_values = []
            float_array = dataset[col].apply(lambda x: np.array([float(y) for y in json.loads(x)]))
            for u in np.array(float_array):
                feature_values.append(u)
            feature_values = np.array(feature_values)
        else:
            feature_values = np.array(dataset[col].astype('float'))
            feature_values = feature_values.reshape(feature_values.shape[0],1)
        if final_data == []:
            final_data = feature_values
            continue
        final_data = np.concatenate((feature_values,final_data),axis=1)
    return final_data

### Fill NAN values with Zeros

In [7]:
def get_train_test(tr_data,te_data):
    final_data_train = get_concateated_data(tr_data)
    final_data_test = get_concateated_data(te_data)
    train = pd.DataFrame(final_data_train).fillna(0)
    test = pd.DataFrame(final_data_test).fillna(0)
    return train,test

### Random Forrest Classifier

In [8]:
def randomForrest(train,test,y_tr,y_te):
    clf = RandomForestClassifier(random_state=1).fit(train, y_tr)
    y_predictions = clf.predict(test)
    return accuracy_score(y_te, y_predictions), clf

### MLP Classifier

In [9]:
def mlpClassifier(train,test,y_tr,y_te):
    clf = MLPClassifier(random_state=1).fit(train, y_tr)
    y_predictions = clf.predict(test)
    return accuracy_score(y_te, y_predictions),clf

### Linear SVC Classifier

In [10]:
def linearSVC(train,test,y_tr,y_te):
    clf = LinearSVC(random_state=1).fit(train,y_tr)
    y_predictions = clf.predict(test)
    return accuracy_score(y_te, y_predictions),clf

### Gives accuracies with certain feature values

In [15]:
def getPredictions(feature_array,training_data,testing_data):
    tr,te = get_train_test(training_data[feature_array],testing_data[feature_array])
    arf,crf = randomForrest(tr,te,y_train,y_test)
    amlp,cmlp = mlpClassifier(tr,te,y_train,y_test)
    alsvc,clsvc = linearSVC(tr,te,y_train,y_test)
    max_acc = max(arf,amlp,alsvc)
    return arf,amlp,alsvc,max_acc,clf

## FastText Model

In [12]:
testing_data1 = pd.read_csv("testing_data.csv")
training_data1 = pd.read_csv("training_data.csv")
numerical_cols = [col for col in numerical_cols if col in training_data1.columns]
boolean_cols = [col for col in boolean_cols if col in training_data1.columns]
string_cols = [col for col in string_cols if col in training_data1.columns]
y_test = pd.read_csv("testing_labels.csv")
y_train = pd.read_csv("training_labels.csv")

In [13]:
training_data1

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,allow_live_comments,archived,author_premium,contest_mode,hide_score,is_original_content,is_reddit_media_domain,is_self,...,name,permalink,post_hint,selftext,selftext_html,suggested_sort,thumbnail,title,url,whitelist_status
0,0,0,0,0,0,0,0,0,0,0,...,"[-0.08767293393611908, 0.06483177095651627, 0....","[-0.5231090188026428, 0.3330666720867157, 1.65...","[1.7087401151657104, -0.06878066062927246, 3.4...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-2.19721699655056, 2.9813412189483643, 0.8320...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.46074700355529785, 0.5137950778007507, 1.5..."
1,816,816,0,0,1,0,0,1,0,1,...,"[0.039095230400562286, 0.1311691701412201, 0.6...","[-0.17063544690608978, 0.14400853216648102, 1....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.402417061564819, 0.5132709581971469, 2.251...","[-0.402417061564819, 0.5132709581971469, 2.251...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.7681823968887329, 0.24478691816329956, 2.59...","[-0.026447908021509647, -0.1435493528842926, 2...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.46074700355529785, 0.5137950778007507, 1.5..."
2,790,790,0,1,0,0,0,0,0,0,...,"[-0.0628524050116539, -0.1159270703792572, 0.7...","[-0.10745223611593246, 0.16195541620254517, 1....","[1.7087401151657104, -0.06878066062927246, 3.4...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.2746192685195378, 0.1194040487919535, 2.054...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.46074700355529785, 0.5137950778007507, 1.5..."
3,361,361,0,1,0,0,0,0,0,0,...,"[-0.23540818691253662, 0.23972070217132568, 0....","[-0.6323097348213196, 0.24111028015613556, 1.9...","[1.7087401151657104, -0.06878066062927246, 3.4...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-1.3447682423436123, 0.8245480005507884, 2.66...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.46074700355529785, 0.5137950778007507, 1.5..."
4,592,592,0,1,1,0,0,0,0,1,...,"[0.06889311224222183, 0.1713256984949112, 0.57...","[0.23157209157943726, 0.4416689872741699, 1.70...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.10075758044656954, 1.4794218228443672, 2.4...","[-0.07414699979126453, 1.452458548732102, 2.47...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.7681823968887329, 0.24478691816329956, 2.59...","[0.43894933847089607, 2.1174640680352845, 2.64...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.46074700355529785, 0.5137950778007507, 1.5..."
5,1675,1675,0,0,0,0,0,0,0,1,...,"[-0.005389569792896509, -0.00810469314455986, ...","[-0.21516478061676025, 0.3880104422569275, 1.3...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-1.3526962054435347, 1.7091469383325244, 2.38...","[-1.3526962054435347, 1.7091469383325244, 2.38...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.7681823968887329, 0.24478691816329956, 2.59...","[-0.40110114429678234, 1.5370868955339705, 1.8...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.46074700355529785, 0.5137950778007507, 1.5..."
6,115,115,0,1,0,0,0,0,0,1,...,"[-0.145392045378685, 0.3732094466686249, 0.471...","[-0.19626905024051666, 0.019445592537522316, 1...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-1.3108515583855265, 1.3623875778387575, 1.28...","[-1.3108515583855265, 1.3623875778387575, 1.28...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.7681823968887329, 0.24478691816329956, 2.59...","[-0.6650661354263624, 1.1312040587266285, 1.96...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.46074700355529785, 0.5137950778007507, 1.5..."
7,1079,1079,0,0,1,0,0,0,0,1,...,"[-0.11287950724363327, 0.05164718255400658, 0....","[-0.13953864574432373, 0.19317224621772766, 0....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.45954645883389994, 1.1586029698189937, 1.4...","[-0.45954645883389994, 1.1586029698189937, 1.4...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.7681823968887329, 0.24478691816329956, 2.59...","[-0.4645153069868684, 0.9428823404014111, 1.80...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.46074700355529785, 0.5137950778007507, 1.5..."
8,546,546,0,1,0,0,0,0,0,1,...,"[-0.05340205878019333, 0.04660188779234886, 0....","[0.07423229515552521, 0.3756207823753357, 1.03...","[0.7681823968887329, 0.24478691816329956, 2.59...","[-0.3089694377034903, 1.5763922203332186, 1.76...","[-0.31234214222058654, 1.6914165900088847, 1.7...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.7681823968887329, 0.24478691816329956, 2.59...","[-0.3239305022704814, 1.460248197295836, 0.347...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.46074700355529785, 0.5137950778007507, 1.5..."
9,336,336,0,1,0,0,0,0,0,1,...,"[-0.00810090359300375, -0.017016511410474777, ...","[-0.2556076645851135, 0.5564022660255432, 1.14...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.44612920390708105, 0.7134798467159271, 1.9...","[-0.44612920390708105, 0.7134798467159271, 1.9...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.7681823968887329, 0.24478691816329956, 2.59...","[-1.3860225373258193, 2.668668615321318, 0.567...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.46074700355529785, 0.5137950778007507, 1.5..."


## Word2Vec Model

In [12]:
# testing_data2 = pd.read_csv("testing_data2.csv")
# training_data2 = pd.read_csv("training_data2.csv")
# numerical_cols = [col for col in numerical_cols if col in training_data2.columns]
# boolean_cols = [col for col in boolean_cols if col in training_data2.columns]
# string_cols = [col for col in string_cols if col in training_data2.columns]
# y_test = pd.read_csv("testing_labels2.csv")
# y_train = pd.read_csv("training_labels2.csv")

In [13]:
# features_max_acc = {}
# for col in training_data1.columns:
#     arf,amlp,alsvc,max_acc,clf = getPredictions([col],training_data1,testing_data1)
#     features_max_acc[col] = max_acc

In [14]:
# def get_top_n_features(n):
#     features = []
#     i=1
#     for k,v in sorted(features_max_acc.items(), key=lambda item: item[1], reverse=True):
#         if i<=n:
#             features.append(k) 
#         i+=1
#     return features

In [15]:
# accuracies = {}
# for i in range(1,len(training_data1.columns)):
#     feat = get_top_n_features(i)
#     arf,amlp,alsvc,max_acc,clf = getPredictions(feat,training_data1,testing_data1)
#     accuracies[i] = max_acc

In [16]:
getPredictions(training_data1.columns,training_data1,testing_data1)

MemoryError: could not allocate 52428800 bytes

In [None]:
getPredictions(training_data2.columns,training_data2,testing_data2)