## Feature selection with chi^2

In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2
import pandas as pd
import numpy as np

In [3]:
def build_unique_ngrams(df, n):
    """
    Count the number of ngrams
    """
    unique_ngrams = df['sentence'].apply(lambda x: x.split()).explode().unique()
    
    print(len(unique_ngrams))
    
    return unique_ngrams

In [4]:
df = pd.read_pickle("dataframes/dataframe_train_0_8_google.pickle") 

In [5]:
vocabulary = build_unique_ngrams(df, 1)

66129


In [6]:
len(vocabulary)

66129

In [12]:
# Compute the chi2 score, using the same vocabulary as provided by the dataframe
vectorizer = CountVectorizer(lowercase=True,stop_words='english', vocabulary=vocabulary)
X = vectorizer.fit_transform(df.sentence)
chi2score = chi2(X, df.label)[0]

In [18]:
np.argmax(chi2score)

5

In [13]:
len(vectorizer.get_feature_names())

66129

In [14]:
df_chi2 = pd.DataFrame({"word": vectorizer.get_feature_names(), "chi2": chi2score})

In [19]:
df_chi2[df_chi2.chi2.isna()]

Unnamed: 0,word,chi2
5,get,
21,n,
39,would,
53,never,
88,u,
...,...,...
65300,count'em,
65722,young'n,
65806,my,
66099,myself,


In [20]:
df_chi2.to_pickle("dataframes/chi2_google_words.pickle")