In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
import time
from sklearn.decomposition import PCA
from datetime import datetime
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import csv

  from numpy.core.umath_tests import inner1d


## Feature Extraction

In [2]:
class FeatureExtraction():
    
    def __init__(self, df):
        self.df = df
    
    def transform(self):
        df = self.df
        #sentiment
        sentiment = pd.Series(df['text']).apply(lambda x: TextBlob(x).sentiment)
        polarity = sentiment.apply(lambda x: x[0])
        subjectivity = sentiment.apply(lambda x: x[1])
        df['polarity']=polarity
        df['subjectivity']=subjectivity

        #verified
        df["user_verified"]=df["user_verified"].astype(int)

        #hashtags
        df["hashtags"].replace(np.nan, "", inplace = True)
        df["num_hashtags"]=df["hashtags"].apply(lambda x : len(x.split(", ")) if x!= "" else 0)
        df['text']=df['text'].apply(lambda x: x.replace('\r',''))

        #length
        df["length"]=df["text"].apply(lambda x : len(TextBlob(x).split(" ")))

        self.transformed_df = df
        pass
    

In [4]:
df = pd.read_csv("data/train.csv")

In [6]:
FE = FeatureExtraction(df)

In [7]:
FE.transform()

In [13]:
features = FE.transformed_df
features.fillna(0, inplace = True)

In [20]:
x = features[["user_verified", "user_statuses_count", "user_followers_count", "user_friends_count","polarity","subjectivity", "num_hashtags"]]

Y = features[["retweet_count"]]

X = x.values

y = Y.values

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Linear SVM

In [21]:
features.shape

(665777, 15)

In [22]:
from sklearn.svm import LinearSVR

In [23]:
svr = LinearSVR()

In [24]:
svr.fit(X_train, y_train)
y_pred_svm = svr.predict(X_test)

  y = column_or_1d(y, warn=True)


In [25]:
X_train.shape

(532621, 7)

In [26]:
metrics.mean_absolute_error(y_test, y_pred_svm)

146.74208821979573

## Random Forest

In [27]:
rf = RandomForestRegressor(n_estimators = 20, random_state = 0)
rf.fit(X_train, y_train)

  


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [28]:
y_pred_rf = rf.predict(X_test)

In [29]:
metrics.mean_absolute_error(y_test, y_pred_rf)

235.31170398690855

## Evaluation

**Loading data**

In [30]:
eval_data = pd.read_csv("data/evaluation.csv")
FE_eval = FeatureExtraction(eval_data)
FE_eval.transform()

In [31]:
trans_eval_data = FE_eval.transformed_df

In [32]:
X_eval = trans_eval_data[["user_verified", "user_statuses_count", "user_followers_count", "user_friends_count","polarity","subjectivity", "num_hashtags"]]

In [33]:
sc = StandardScaler()
X_eval = sc.fit_transform(X_eval.values)

**Evaluating SVM**

In [34]:
y_eval_pred_svr = svr.predict(X_eval)

In [35]:
with open("linear_svm_predictions.txt", 'w') as f:
    writer = csv.writer(f)
    writer.writerow(["TweetID", "NoRetweets"])
    for index, prediction in enumerate(y_eval_pred_svr):
        writer.writerow([str(trans_eval_data['id'].iloc[index]) , str(int(prediction))])

**Evaluating Random Forest**

In [36]:
y_eval_pred_rf = rf.predict(X_eval)

In [37]:
with open("random_forest_predictions.txt", 'w') as f:
    writer = csv.writer(f)
    writer.writerow(["TweetID", "NoRetweets"])
    for index, prediction in enumerate(y_eval_pred_rf):
        writer.writerow([str(trans_eval_data['id'].iloc[index]) , str(int(prediction))])