## Baseline model - Logistic Regression

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd

# Fetch the NLTK resources this notebook relies on: the WordNet corpus (used by
# WordNetLemmatizer) and the stopword lists (used by stopwords.words).
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /Users/mika/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/mika/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [24]:
# Load the processed headlines table; later cells read its 'headline' and
# 'is_sarcastic' columns. The path is relative to the current working directory.
processed_csv_path = '../data/processed/sarcasm_headlines_processed.csv'
data = pd.read_csv(processed_csv_path)

In [25]:
#clean data
# Patterns are raw strings so regex escapes such as \w, \s and \d reach the
# regex engine untouched instead of being parsed as (invalid) Python string
# escapes, which emits a SyntaxWarning on recent Python versions.
X_clean = (
    data['headline']
    .replace(r'[^ -~]+', '', regex=True)       # drop anything outside the printable-ASCII range
    .str.replace(r'[^\w\s]', '', regex=True)   # drop punctuation (non-word, non-space characters)
    .replace('_', '', regex=True)              # '_' counts as a word character, so strip it separately
    .replace(r'\d+', '', regex=True)           # drop digit runs
)

In [26]:
#lemmatizer
lemmatizer = WordNetLemmatizer()
# Lemmatize each whitespace-separated token (no POS tag is passed to the
# lemmatizer) and stitch the tokens back together with single spaces.
X_clean = X_clean.apply(
    lambda headline: ' '.join(map(lemmatizer.lemmatize, headline.split()))
)

In [27]:
# Preview the cleaned, lemmatized headlines.
X_clean

0        thirtysomething scientist unveil doomsday cloc...
1        dem rep totally nail why congress is falling s...
2             eat your veggie deliciously different recipe
3        inclement weather prevents liar from getting t...
4        mother come pretty close to using word streami...
                               ...                        
55323                  american politics in moral freefall
55324                                    america best hike
55325                                 reparation and obama
55326    israeli ban targeting boycott supporter raise ...
55327                          gourmet gift for the foodie
Name: headline, Length: 55328, dtype: object

In [28]:
#stopwords
stop = stopwords.words('english')
# Membership is tested once per token, so look words up in a set (constant
# time) rather than scanning the list; `stop` itself is left as a list.
stop_set = set(stop)
# NOTE(review): punctuation was stripped from the headlines in the cleaning
# step, so any stopword entries that contain apostrophes can no longer match
# their stripped forms -- confirm whether that is intended.
X_clean_fin = X_clean.apply(
    lambda x: ' '.join(w for w in x.split() if w not in stop_set)
)

In [29]:
# Preview the headlines after stopword removal.
X_clean_fin

0        thirtysomething scientist unveil doomsday cloc...
1        dem rep totally nail congress falling short ge...
2                  eat veggie deliciously different recipe
3             inclement weather prevents liar getting work
4        mother come pretty close using word streaming ...
                               ...                        
55323                     american politics moral freefall
55324                                    america best hike
55325                                     reparation obama
55326    israeli ban targeting boycott supporter raise ...
55327                                  gourmet gift foodie
Name: headline, Length: 55328, dtype: object

In [30]:
#vectorization
# TF-IDF features over single words only (unigrams).
# NOTE(review): the vectorizer is fitted on the whole corpus here, before the
# train/test split further down, so the vocabulary and IDF weights are computed
# with test headlines included -- consider fitting on the training split only.
unigram_range = (1, 1)
vectorizer = TfidfVectorizer(ngram_range=unigram_range)
X_vect = vectorizer.fit_transform(X_clean_fin)

In [31]:
#normalization
# Row-wise L2 normalization (the defaults of `normalize`, written out).
# NOTE(review): TfidfVectorizer with default settings should already emit
# L2-normalized rows, which would make this step a no-op -- confirm.
X = normalize(X_vect, norm='l2', axis=1)

In [32]:
#data split
# Hold out 40% of the rows for evaluation; the fixed seed keeps the split reproducible.
labels = data['is_sarcastic']
X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(
    X,
    labels,
    test_size=0.4,
    random_state=42,
)

In [33]:
# Baseline classifier: logistic regression with library-default hyperparameters,
# trained on the TF-IDF features of the training split.
baseline_model = LogisticRegression()
baseline_model.fit(X=X_train_lr, y=y_train_lr)

In [34]:
# Predict labels for the held-out split and report the fraction classified correctly.
y_pred_lr = baseline_model.predict(X_test_lr)
accuracy_score(y_true=y_test_lr, y_pred=y_pred_lr)

0.8433489969275257

In [35]:
# F1 score on the held-out split, using the same predictions as the accuracy above.
f1_score(y_true=y_test_lr, y_pred=y_pred_lr)

0.8232114629544642

In [36]:
# Confusion matrix on the held-out split (true labels passed first, predictions second).
confusion_matrix(y_true=y_test_lr, y_pred=y_pred_lr)

array([[10593,  1334],
       [ 2133,  8072]])

In [37]:
#example on both sarcastic and non-sarcastic
ex = ['clinton becomes first president to clear 18 feet in pole vault', 'year-round schooling: how it would help minority students']
# Run the raw examples through the same preprocessing as the training
# headlines (character cleanup, lemmatization, stopword removal) so that the
# tokens line up with the vocabulary the vectorizer was fitted on; otherwise
# e.g. "year-round" is tokenized differently from the training text.
ex_clean = (
    pd.Series(ex)
    .str.replace(r'[^ -~]+', '', regex=True)
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.replace('_', '', regex=True)
    .str.replace(r'\d+', '', regex=True)
    .apply(lambda text: ' '.join(lemmatizer.lemmatize(tok) for tok in text.split()))
    .apply(lambda text: ' '.join(tok for tok in text.split() if tok not in stop))
)
# Apply the same normalization step that the training features went through.
ex_vect = normalize(vectorizer.transform(ex_clean))
baseline_model.predict(ex_vect)

array([1, 0])