In [None]:
# Mount Google Drive inside the Colab VM; the dataset path used later in this
# notebook lives under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score


# Load train-balanced-sarcasm.csv from the mounted Drive into a pandas DataFrame.
path = "/content/drive/MyDrive/train-balanced-sarcasm.csv"
df = pd.read_csv(path)

# Drop rows with a missing 'comment': the text is the only model input, so such
# rows carry no usable signal. Reassigning (instead of inplace=True) keeps the
# step explicit and chainable.
df = df.dropna(subset=['comment'])

# Seed reused for the split here and for the classifier further down.
# 17 gave the best output of the values tried.
# NOTE(review): if "best" means the test accuracy printed below, choosing the
# seed that way makes the reported score optimistic -- confirm.
randoms = 17

# Model input (comment text) and target (label).
# Despite the "_train" suffix, these are the full columns before splitting.
comment_train = df['comment']
label_train = df['label']

# Hold out a test split; no test_size is given, so scikit-learn's default
# proportion (25% test) applies.
training_words, testing_words, training_scores, testing_scores = train_test_split(
    comment_train,
    label_train,
    random_state=randoms,
)

# TF-IDF over unigrams and bigrams together; using only (1, 1) or only (2, 2)
# performed worse.
# max_features caps the vocabulary at 50,000 terms, and min_df=10 ignores terms
# that occur in fewer than 10 comments (10 seemed to give the best output).
stats = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=50000,
    min_df=10,
)

# scikit-learn's built-in logistic regression, using the SAG solver and the
# shared seed so runs are repeatable.
lr_model = LogisticRegression(solver='sag', random_state=randoms)

# Chain vectorizer and classifier so they are fitted and applied as one unit.
pipe = Pipeline(steps=[
    ('stats', stats),
    ('lr_model', lr_model),
])

# Fit the whole pipeline on the training split: the vectorizer learns its
# vocabulary/IDF weights, then the classifier is trained on the result.
pipe.fit(training_words, training_scores)

# Predict labels for the held-out comments.
guess = pipe.predict(testing_words)

# Report the fraction of held-out comments classified correctly.
test_accuracy = accuracy_score(testing_scores, guess)
print("Score", test_accuracy, sep="\n")

!pip install eli5
import eli5 
print(eli5.show_weights(estimator=pipe.named_steps['lr_model'], vec=pipe.named_steps['stats']))
#display_df = eli5.explain_weights_df(estimator=pipe.named_steps['lr_model'], vec=pipe.named_steps['stats'])
#print(display_df.head(10))
#print(display_df.tail(10))



Score
0.7210420508599333
<IPython.core.display.HTML object>
   target        feature    weight
0       1    yes because  8.961626
1       1      obviously  7.492704
2       1        clearly  7.462800
3       1   yeah because  7.123800
4       1        totally  6.794973
5       1        because  6.186662
6       1       how dare  5.843897
7       1  right because  5.398182
8       1     good thing  5.184803
9       1            duh  5.050009
       target      feature    weight
49991       1   not really -3.625312
49992       1          imo -3.810601
49993       1  necessarily -3.829144
49994       1     that but -3.918156
49995       1    right but -3.996097
49996       1    generally -4.068023
49997       1       it but -4.282503
49998       1  fair enough -4.319423
49999       1     true but -4.469713
50000       1         iirc -4.561264
