In [38]:
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# Load the labelled tweets; the path is relative to the notebook's working directory.
data = pd.read_csv('twitter_sentiments.csv')

# Preview the first five rows as plain text.
print(data.head())

   id  label                                              tweet
0   1      0   @user when a father is dysfunctional and is s...
1   2      0  @user @user thanks for #lyft credit i can't us...
2   3      0                                bihday your majesty
3   4      0  #model   i love u take with u all the time in ...
4   5      0             factsguide: society now    #motivation


In [39]:
# (rows, columns) of the full dataset
data.shape

(31962, 3)

In [40]:
# Absolute number of tweets per label, to check how balanced the classes are.
data['label'].value_counts()

0    29720
1     2242
Name: label, dtype: int64

In [41]:
# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class ratio the same in both splits; the fixed seed makes the split repeatable.
train, test = train_test_split(
    data,
    test_size=0.2,
    stratify=data['label'],
    random_state=21,
)
train.shape, test.shape

((25569, 3), (6393, 3))

In [42]:
# Label proportions in the training split (should mirror the full dataset).
train['label'].value_counts(normalize=True)

0    0.929837
1    0.070163
Name: label, dtype: float64

In [43]:
# Label proportions in the test split (should mirror the training split).
test['label'].value_counts(normalize=True)

0    0.929923
1    0.070077
Name: label, dtype: float64

In [44]:
# TF-IDF vectoriser: lower-cases the text, drops English stop words and keeps
# at most 1000 terms in the vocabulary.
# stop_words='english' selects scikit-learn's built-in English list (the same
# words as ENGLISH_STOP_WORDS). Passing the frozenset itself is rejected by the
# parameter validation in scikit-learn >= 1.2, which accepts only 'english',
# a list, or None.
tfidf_vectorizer = TfidfVectorizer(lowercase=True, max_features=1000, stop_words='english')

# Learn the vocabulary and IDF weights from the training tweets only, so that
# nothing from the test split influences the features.
tfidf_vectorizer.fit(train.tweet)

TfidfVectorizer(max_features=1000,
                stop_words=frozenset({'a', 'about', 'above', 'across', 'after',
                                      'afterwards', 'again', 'against', 'all',
                                      'almost', 'alone', 'along', 'already',
                                      'also', 'although', 'always', 'am',
                                      'among', 'amongst', 'amoungst', 'amount',
                                      'an', 'and', 'another', 'any', 'anyhow',
                                      'anyone', 'anything', 'anyway',
                                      'anywhere', ...}))

In [45]:
# Turn both splits into TF-IDF matrices using the vectoriser fitted on the
# training tweets (the test tweets are only transformed, never fitted on).
train_idf = tfidf_vectorizer.transform(train.tweet)
test_idf  = tfidf_vectorizer.transform(test.tweet)

In [46]:
# Logistic regression classifier with scikit-learn's default settings.
# NOTE(review): the labels are roughly 93% / 7% imbalanced; consider
# class_weight='balanced' if the F1 score on the minority class matters.
model_LR = LogisticRegression()

In [47]:
# Train the classifier on the TF-IDF features of the training tweets.
model_LR.fit(train_idf, train['label'])

LogisticRegression()

In [48]:
# Predict labels for the training data (used to compare train vs. test scores)
predict_train = model_LR.predict(train_idf)

In [49]:
# Predict labels for the held-out test data
predict_test = model_LR.predict(test_idf)

In [50]:
# F1 score of the predictions on the training data.
f1_score(y_true=train['label'], y_pred=predict_train)

0.48840927258193445

In [51]:
# F1 score on the test data
f1_score(y_true = test.label, y_pred = predict_test)

0.46003262642740617

In [53]:
# Bundle vectorisation and classification into one estimator so raw tweet text
# can be passed straight to fit/predict.
# stop_words='english' selects scikit-learn's built-in English list (the same
# words as ENGLISH_STOP_WORDS). Passing the frozenset itself is rejected by the
# parameter validation in scikit-learn >= 1.2, which accepts only 'english',
# a list, or None.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(lowercase=True, max_features=1000, stop_words='english')),
    ('model', LogisticRegression()),
])

In [54]:
# Fit the whole pipeline on the raw training tweets and their labels
pipeline.fit(train.tweet, train.label)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_features=1000,
                                 stop_words=frozenset({'a', 'about', 'above',
                                                       'across', 'after',
                                                       'afterwards', 'again',
                                                       'against', 'all',
                                                       'almost', 'alone',
                                                       'along', 'already',
                                                       'also', 'although',
                                                       'always', 'am', 'among',
                                                       'amongst', 'amoungst',
                                                       'amount', 'an', 'and',
                                                       'another', 'any',
                                                       'anyhow', 'anyone',
           