# This notebook contains all classes used in the project, as well as an example of a complete execution 

* Given a training dataset, fit a classifier, then fit a regressor for every class.
* Given the testing dataset, classify sample then perform regression using the appropriate regressor.

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
import time
from sklearn.decomposition import PCA
from datetime import datetime
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVR
from sklearn import metrics
import csv
import sys

In [4]:
features = ["user_verified", "user_statuses_count", "user_followers_count", "user_friends_count","polarity","subjectivity", "num_hashtags", "num_mentions"]

## 1. Feature extraction

In [5]:
class FeatureExtraction():
    
    def __init__(self, df):
        self.df = df
    
    def transform(self):
        df = self.df
        #sentiment
        sentiment = pd.Series(df['text']).apply(lambda x: TextBlob(x).sentiment)
        polarity = sentiment.apply(lambda x: x[0])
        subjectivity = sentiment.apply(lambda x: x[1])
        df['polarity']=polarity
        df['subjectivity']=subjectivity

        #verified
        df["user_verified"]=df["user_verified"].astype(int)

        #hashtags
        df["hashtags"].replace(np.nan, "", inplace = True)
        df["num_hashtags"]=df["hashtags"].apply(lambda x : len(x.split(", ")) if x!= "" else 0)
        df['text']=df['text'].apply(lambda x: x.replace('\r',''))

        #length
        df["length"]=df["text"].apply(lambda x : len(TextBlob(x).split(" ")))
        
        #num_mentions
        df["user_mentions"].replace(np.nan, "", inplace = True)
        df["num_mentions"]=df["user_mentions"].apply(lambda x : len(x.split(", ")) if x!= "" else 0)

        #fillna
        df.fillna(0,inplace = True)
        
        self.transformed_df = df
        
        pass
    

In [6]:
class Classification():
    
    def label(count):
        if count == 0:
            return 0
        else:
            if count<=10:
                return 1
            else:
                if count<=100:
                    return 2
                else:
                    return 3 
                
    def __init__(self, df):
        self.df = df
        self.df["class"] = self.df["retweet_count"].apply(Classification.label)
         
                
    def classify(self,features):
        
        x = self.df[features]

        Y = self.df[["class"]]

        X = x.values

        y = Y.values

        #X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

        sc_class = StandardScaler()
        X_train = sc_class.fit_transform(X)
        #X_test = sc_class.transform(X_test)
        
        self.scaler = sc_class
        
        #some classifier
        rf = RandomForestClassifier(n_estimators = 100, random_state = 0, criterion = "entropy")
        
        rf.fit(X, y)
        
        self.classifier = rf
        
        pass
 

In [7]:
class Regression():
    
    def __init__(self, df):
        self.df = df
        
    def regression_per_class(self, features):
    #returns a regressor for every class
        regressors=[]
        scalers=[]
        by_class = self.df.groupby(["class"])
        for _,group in by_class:
            X_r = group[features]
            Y_r = group[['retweet_count']]
            y_r = Y_r.values

            #X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X_r, Y_r, test_size=0.2, random_state=0)
            sc = StandardScaler()
            X_train_r = sc.fit_transform(X_r)
            #X_test_r = sc.transform(X_test_r)
            

            svr = LinearSVR()
            svr.fit(X_train_r, y_r)
            regressors += [svr]
            scalers+=[sc]
            
        self.regressors = regressors
        self.scalers = scalers
        

In [8]:
df = pd.read_csv('data/train.csv')

In [9]:
FE = FeatureExtraction(df)

**Instead of transforming the data (takes a lot of time), we import the already-calculated features (for details check the FeatureExtraction class).**

In [10]:
features_df = pd.read_csv("data/features_v1.csv")

In [11]:
Classifier = Classification(features_df)

In [12]:
Classifier.classify(features)



In [13]:
Regressor = Regression(Classifier.df)

In [14]:
Regressor.regression_per_class(features)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


## Putting it together to predict on eval data

In [15]:
eval_data = pd.read_csv("data/evaluation.csv")

In [16]:
def pred_reg_class(test_df, classifier, regressors, features, scalers_reg, scaler_class):
    #we suppose that test_df already has the extracted features
    #classify then do regression
    y_pred = pd.DataFrame(np.zeros(len(test_df)), columns = ["pred"])
    
    #classification
    X_class = test_df[features].values
    X_class = scaler_class.transform(X_class)
    y_class = classifier.predict(X_class)
    test_df["class"]=y_class

    #regression
    for c in range(4):
        if (test_df.loc[test_df["class"]==c].shape[0]!=0):
            X_ = test_df.loc[test_df["class"]==c][features].values
            X_ = scalers_reg[c].transform(X_)
            y_ = regressors[c].predict(X_)
            y_pred.loc[test_df["class"]==c,"pred"]=y_
        
    return y_pred

In [17]:
FE_eval = FeatureExtraction(eval_data)

**Same as before, to save time we import already transformed evaluation data.**

In [18]:
#FE_eval.transform()
#eval_ = FE_eval.transformed_df

In [19]:
eval_ = pd.read_csv("data/trans_eval_data.csv")

In [20]:
y_pred_rc = pred_reg_class(eval_, Classifier.classifier, Regressor.regressors, features, Regressor.scalers, Classifier.scaler)

In [21]:
y_pred_rc.describe()

Unnamed: 0,pred
count,285334.0
mean,0.435766
std,1.218527
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,86.386002


In [22]:
with open("classif_regr_2_predictions.txt", 'w') as f:
    writer = csv.writer(f)
    writer.writerow(["TweetID", "NoRetweets"])
    for index, prediction in enumerate(y_pred_rc.values):
        writer.writerow([str(eval_['id'].iloc[index]) , str(prediction[0])])

In [None]:
eval_.to_csv("data/trans_eval_data.csv")