# Pre-Processing


Final Imputing and Dropping

[Training set, Validation Set, and Final Test](#Training-set,-Validation-Set,-and-Final-Test)

[Numeric Scaling](#Numeric-Scaling)
 - Note the Count Vectorization takes place in the Modelling Notebook

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the post-EDA dataset, using the first CSV column as the row index,
# then show the first rows as the cell output.
df = pd.read_csv(
    r'/Users/michelstahli/Spring 2022 (Bootcamp+)/Jupyter Notebook CSVs/df_post_eda.csv',
    index_col=0,
)
df.head()

Unnamed: 0,bigotry,directed,sentiment_comment,sentiment_topic,overall_sentiment_score,ups,dal,word_count,up_low_ratio,text_lemmatized,disgust,surprise,neutral,anger,sad,happy,fear
1,False,True,neutral,neutral,0.688598,13,8.38,16,0.045455,"['yeah', 'man', 'back', 'then', 'o', 'is', 'di...",0.015762,0.026123,0.003467,0.01755,0.032248,0.024791,0.040254
2,False,True,neutral,neutral,-0.41342,8,11.38,38,0.035211,"['gt', 'i', 'just', 'extended', 'for', 'a', 'y...",0.005102,0.035714,0.005102,0.02551,0.045918,0.005102,0.045918
3,False,True,neutral,neutral,-0.880108,2,8.18,12,0.035088,"['the', 'hell', 'that', 'many', 'people', 'get...",0.007692,0.010889,0.0001,0.068831,0.027073,0.015085,0.013686
4,False,True,positive,positive,3.160947,2,10.09,41,0.043269,"['worked', 'perfectly', 'im', 'glad', 'to', 'h...",0.00127,0.017361,0.000836,0.007215,0.016621,0.018059,0.013498
5,False,True,positive,positive,-1.269243,2,7.04,10,0.044444,"['this', 'is', 'so', 'bullshit', 'why', 'doe',...",2e-06,4.6e-05,0.000736,4.2e-05,4.6e-05,0.000553,1.3e-05


515 rows were missing their emotion sensor scores. 
I decided to fill these with `0`. None of the rows with missing emotion sensor scores had bigotry == True. 

#### Impute zero for missing emotion scores, drop unnecessary columns, rename the emotion columns.

In [4]:
# `df_proc_onlyb` is another name for the same frame as `df` (no copy is made),
# so the fill below is visible through both names.
df_proc_onlyb = df

# Emotion sensor score columns; a missing score is replaced with 0.
emotion_cols = ['disgust', 'surprise', 'neutral', 'anger', 'sad', 'happy', 'fear']
df_proc_onlyb[emotion_cols] = df_proc_onlyb[emotion_cols].fillna(0)

In [5]:
# Remove the sentiment and `directed` columns, then give every emotion score
# column an `_x` suffix.
df_proc_onlyb = (
    df_proc_onlyb
    .drop(columns=['sentiment_comment', 'sentiment_topic', 'overall_sentiment_score', 'directed'])
    .rename(columns={
        'disgust': 'disgust_x',
        'surprise': 'surprise_x',
        'neutral': 'neutral_x',
        'anger': 'anger_x',
        'sad': 'sad_x',
        'happy': 'happy_x',
        'fear': 'fear_x',
    })
)


Three rows were left with missing text - presumably comments that contained no words. Drop them.

In [6]:
# Print the shape, drop every row that still contains a missing value in any
# column, and print the shape again to show how many rows were removed.
print(df_proc_onlyb.shape)
df_proc_onlyb = df_proc_onlyb.dropna(axis=0, how='any')
print(df_proc_onlyb.shape)

(8925, 13)
(8922, 13)


In [7]:
# Display the first rows of the cleaned frame as the cell output.
df_proc_onlyb.head()

Unnamed: 0,bigotry,ups,dal,word_count,up_low_ratio,text_lemmatized,disgust_x,surprise_x,neutral_x,anger_x,sad_x,happy_x,fear_x
1,False,13,8.38,16,0.045455,"['yeah', 'man', 'back', 'then', 'o', 'is', 'di...",0.015762,0.026123,0.003467,0.01755,0.032248,0.024791,0.040254
2,False,8,11.38,38,0.035211,"['gt', 'i', 'just', 'extended', 'for', 'a', 'y...",0.005102,0.035714,0.005102,0.02551,0.045918,0.005102,0.045918
3,False,2,8.18,12,0.035088,"['the', 'hell', 'that', 'many', 'people', 'get...",0.007692,0.010889,0.0001,0.068831,0.027073,0.015085,0.013686
4,False,2,10.09,41,0.043269,"['worked', 'perfectly', 'im', 'glad', 'to', 'h...",0.00127,0.017361,0.000836,0.007215,0.016621,0.018059,0.013498
5,False,2,7.04,10,0.044444,"['this', 'is', 'so', 'bullshit', 'why', 'doe',...",2e-06,4.6e-05,0.000736,4.2e-05,4.6e-05,0.000553,1.3e-05


### Training set, Validation Set, and Final Test

In [11]:
# Target is the `bigotry` column; every other column is a feature.
y = df_proc_onlyb['bigotry']
X = df_proc_onlyb.drop(columns=['bigotry'])

# Hold out 20% of the rows as the final test set, stratified on the target so
# both parts keep the same class balance. The seed is fixed for repeatability.
X_model, X_test, y_model, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=26,
    stratify=y,
)


In [12]:
# Split the modelling portion again: 20% of it becomes the validation set,
# stratified on the target, with the same fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X_model,
    y_model,
    test_size=0.2,
    random_state=26,
    stratify=y_model,
)

### Numeric Scaling
Used MinMaxScaler. Could have considered something stronger, like PowerTransformer.

In [13]:
def scale_num_bigotry(X_train, X_test):
    """Min-max scale the numeric columns of two feature frames.

    The scaler is fitted on the numeric columns of ``X_train`` only and then
    applied to both frames, so nothing from ``X_test`` influences the scaling.
    Non-numeric columns are carried over unchanged.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame the scaler is fitted on.
    X_test : pandas.DataFrame
        Frame transformed with the scaler fitted on ``X_train``. It must
        contain the numeric columns found in ``X_train``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train_scaled, X_test_scaled)``. Both are new frames; the frames
        passed in are left untouched.
    """
    from sklearn.preprocessing import MinMaxScaler

    numeric_cols = X_train.select_dtypes(include='number').columns

    # Work on copies: assigning into the arguments themselves would silently
    # overwrite the caller's unscaled data (and can raise
    # SettingWithCopyWarning when the arguments are slices of another frame).
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()

    # Nothing to scale; the scaler cannot be fitted on zero columns.
    if len(numeric_cols) == 0:
        return X_train_scaled, X_test_scaled

    # Fit only on training
    MM_scaler = MinMaxScaler()
    MM_scaler.fit(X_train[numeric_cols])

    X_train_scaled[numeric_cols] = MM_scaler.transform(X_train[numeric_cols])
    X_test_scaled[numeric_cols] = MM_scaler.transform(X_test[numeric_cols])
    return X_train_scaled, X_test_scaled

In [14]:
# Validation features are scaled with a scaler fitted on X_train only.
X_train_scaled, X_val_scaled = scale_num_bigotry(X_train, X_val)
# Hold-out test features are scaled with a scaler fitted on X_model
# (the portion that was split into train + validation).
X_model_scaled, X_test_scaled = scale_num_bigotry(X_model, X_test)

In [15]:
# Display the first rows of the scaled training features as the cell output.
X_train_scaled.head()

Unnamed: 0,ups,dal,word_count,up_low_ratio,text_lemmatized,disgust_x,surprise_x,neutral_x,anger_x,sad_x,happy_x,fear_x
4810,0.020278,0.027289,0.007385,0.019608,"['my', 'question', 'is', 'why', 'the', 'hell',...",0.000159,0.001088,0.021392,0.000437,0.001057,0.001371,0.00107
980,0.020833,0.02367,0.005538,0.0625,"['hello', 'lady', 'why', 'doe', 'it', 'not', '...",0.11694,0.180855,0.050729,0.249121,0.431424,0.139643,0.094614
6323,0.020278,0.211989,0.000615,0.107143,"['le', 'sighhttpiimgurcomgif']",0.0,0.0,0.0,0.0,0.0,0.0,0.0
700,0.020556,0.044095,0.003692,0.0625,"['ha', 'gleaned', 'that', 'from', 'your', 'com...",0.0,0.0,0.0,0.0,0.0,0.0,0.0
9578,0.020278,0.034278,0.022154,0.06875,"['doe', 'it', 'still', 'use', 'underneath', 'f...",0.028498,0.244707,0.027259,0.21197,0.238071,0.258791,0.149525


***Save all as csv's.***

In [16]:
# Each entry pairs an object with the CSV file it is written to. Files are
# written in the listed order, with the row index included (pandas default).
csv_outputs = [
    # Full feature frame from before the splits.
    (X, r'/Users/michelstahli/Spring 2022 (Bootcamp+)/Jupyter Notebook CSVs/X_for_vector.csv'),
    # Train / validation features and targets.
    (X_train_scaled, r'/Users/michelstahli/Spring 2022 (Bootcamp+)/Jupyter Notebook CSVs/X_train_scaled_ob.csv'),
    (X_val_scaled, r'/Users/michelstahli/Spring 2022 (Bootcamp+)/Jupyter Notebook CSVs/X_val_scaled_ob.csv'),
    (y_train, r'/Users/michelstahli/Spring 2022 (Bootcamp+)/Jupyter Notebook CSVs/y_train_ob.csv'),
    (y_val, r'/Users/michelstahli/Spring 2022 (Bootcamp+)/Jupyter Notebook CSVs/y_val_ob.csv'),
    # Modelling portion (train + validation) and the hold-out test set.
    (y_model, r'/Users/michelstahli/Spring 2022 (Bootcamp+)/Jupyter Notebook CSVs/y_model_ob.csv'),
    (X_test_scaled, r'/Users/michelstahli/Spring 2022 (Bootcamp+)/Jupyter Notebook CSVs/X_test__scaled_ob.csv'),
    (X_model_scaled, r'/Users/michelstahli/Spring 2022 (Bootcamp+)/Jupyter Notebook CSVs/X_final_train_scaled_ob.csv'),
    (y_test, r'/Users/michelstahli/Spring 2022 (Bootcamp+)/Jupyter Notebook CSVs/y_test_ob.csv'),
]
for frame, csv_path in csv_outputs:
    frame.to_csv(csv_path)