In [14]:
import numpy as np 
import pandas as pd 
# from sklearn import preprocessing # feature_extraction, linear_model, model_selection,
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.metrics import f1_score


In [2]:
train_df = pd.read_csv("nlp-getting-started/train.csv")
test_df = pd.read_csv("nlp-getting-started/test.csv")

In [3]:
train_sub = train_df['target']

In [4]:
train_df.shape, test_df.shape

((7613, 5), (3263, 4))

In [5]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
# see tweet

train_df["target"] == 0
print(((train_df["target"] == 0) == True).sum() + ((train_df["target"] == 0) == False).sum())

print('Num of fake disaster tweet in train: ', ((train_df["target"] == 0) == True).sum())
print('Num of real disaster tweet in train: ', ((train_df["target"] == 0) == False).sum())

print('Are the more fake disaster tweet than real disaster tweet? ', ((train_df["target"] == 0) == True).sum()
      >= ((train_df["target"] == 0) == False).sum())
print()
print('Example of tweet with fake disaster: ') 
print(train_df[train_df["target"] == 0][["id", "text"]].values[:4]) # нет разницы сначала ставить колнки а потом условие id?
print()
print('Example of tweet with real disaster: ') 
print(train_df[['id', 'text']][train_df["target"] == 1].values[:4]) # нет разницы сначала ставить колнки а потом условие id?

7613
Num of fake disaster tweet in train:  4342
Num of real disaster tweet in train:  3271
Are the more fake disaster tweet than real disaster tweet?  True

Example of tweet with fake disaster: 
[[23 "What's up man?"]
 [24 'I love fruits']
 [25 'Summer is lovely']
 [26 'My car is so fast']]

Example of tweet with real disaster: 
[[1
  'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all']
 [4 'Forest fire near La Ronge Sask. Canada']
 [5
  "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected"]
 [6 '13,000 people receive #wildfires evacuation orders in California ']]


In [7]:
# see tweet with keyword
print(train_df[train_df["keyword"].isna() == False][['id', 'keyword', 'text']].values[:4])

[[48 'ablaze' '@bbcmtd Wholesale Markets ablaze http://t.co/lHYXEOHY6C']
 [49 'ablaze'
  'We always try to bring the heavy. #metal #RT http://t.co/YAo1e0xngw']
 [50 'ablaze'
  '#AFRICANBAZE: Breaking news:Nigeria flag set ablaze in Aba. http://t.co/2nndBGwyEi']
 [52 'ablaze' 'Crying out for more! Set me ablaze']]


In [8]:
# use CountVectorizer for vectorize word in tweet 
count_vectorizer = CountVectorizer()

example_train_vectors = count_vectorizer.fit_transform(train_df["text"][0:3])

print('train tweet: ')
print(train_df["text"].values[:3], '\n')
print('feature names from tweet: ')
print(count_vectorizer.get_feature_names(), '\n')
print('count vectorizer: ')
print(example_train_vectors.toarray())

print('\n',example_train_vectors.shape) # number of unique word in this tweet

train tweet: 
['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'
 'Forest fire near La Ronge Sask. Canada'
 "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected"] 

feature names from tweet: 
['all', 'allah', 'are', 'asked', 'being', 'by', 'canada', 'deeds', 'earthquake', 'evacuation', 'expected', 'fire', 'forest', 'forgive', 'in', 'la', 'may', 'near', 'no', 'notified', 'of', 'officers', 'or', 'orders', 'other', 'our', 'place', 'reason', 'residents', 'ronge', 'sask', 'shelter', 'the', 'this', 'to', 'us'] 

count vectorizer: 
[[1 1 1 0 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 1 0 1]
 [0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0]
 [1 0 2 1 1 1 0 0 0 1 1 0 0 0 2 0 0 0 1 1 0 1 1 1 1 0 2 0 1 0 0 2 0 0 1 0]]

 (3, 36)


In [49]:
example_train_vectors.toarray()[0][0]

1

In [9]:
# fit_transform - Learn the vocabulary dictionary and return term-document matrix.
train_vectors = count_vectorizer.fit_transform(train_df["text"])
# transform - Transform documents to document-term matrix.
test_vectors = count_vectorizer.transform(test_df["text"])

# it's mean we learn dictionary on train_df and after use this diсtionary for vectorize test_df

In [11]:
train_vectors.shape, test_vectors.shape

((7613, 21637), (3263, 21637))

In [12]:
seed = 42
np.random.seed(seed)

In [13]:
X, X_test, y, y_test = train_test_split(train_vectors, train_sub, test_size=0.2, 
                                                    random_state=seed)

In [20]:
X.shape, X_test.shape

((6090, 21637), (1523, 21637))

In [25]:
# RidgeClassifier

clf = RidgeClassifier()
scores = cross_val_score(clf, X, y, cv=3, scoring="f1")
print(scores)
clf.fit(X, y);

preds = clf.predict(X_test)
# accuracy test
print(f1_score(y_test, preds))

preds_test = clf.predict(test_vectors)
preds_df = pd.DataFrame(test_df['id'])
preds_df['target'] = preds_test
preds_df.to_csv("nlp-getting-started/submission_get_started.csv", index=False)

[0.71271394 0.72379778 0.71869328]
0.741753821399839


In [24]:
# LogisticRegression

model = LogisticRegression(solver='newton-cg')
scores_m = cross_val_score(model, X, y, cv=3, scoring="f1")
print(scores_m)
model.fit(X, y)

preds_m = model.predict(X_test)
# accuracy test
print(f1_score(y_test, preds_m))

preds_test_m = model.predict(test_vectors)
preds_df_m = pd.DataFrame(test_df['id'])
preds_df_m['target'] = preds_test_m
preds_df_m.to_csv("nlp-getting-started/submission_get_started-2.csv", index=False)

[0.73416149 0.74722565 0.72424429]
0.7646103896103896


In [27]:
X.shape

(6090, 21637)

In [36]:
X[0][0]

<1x21637 sparse matrix of type '<class 'numpy.int64'>'
	with 20 stored elements in Compressed Sparse Row format>

In [55]:
y.iloc[0]

1

In [56]:
# use KFold on LogisticRegression

result = pd.DataFrame()
kf = KFold(n_splits=5, shuffle=True)
for num, idxs in enumerate(kf.split(X)):
    print(f'Fold № {num}')
    train_index = idxs[0]
    val_index = idxs[1]
#     print(train_index, train_index.shape)
#     print(val_index, val_index.shape)
    X_train, X_val = X.toarray()[train_index], X.toarray()[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    model = LogisticRegression()
    model.fit(X_train, y_train)
    dummy = {'Fold №': int(num), 'Accuracy': model.score(X_val, y_val)}
    result = result.append(dummy, ignore_index=True) 

Fold № 0
[   0    1    2 ... 6087 6088 6089] (4872,)
[  10   14   22 ... 6076 6082 6083] (1218,)
Fold № 1
[   0    1    4 ... 6087 6088 6089] (4872,)
[   2    3    5 ... 6079 6081 6086] (1218,)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fold № 2
[   0    2    3 ... 6087 6088 6089] (4872,)
[   1   15   18 ... 6063 6077 6085] (1218,)
Fold № 3
[   1    2    3 ... 6085 6086 6087] (4872,)
[   0    6   11 ... 6075 6088 6089] (1218,)
Fold № 4
[   0    1    2 ... 6086 6088 6089] (4872,)
[   4   12   26 ... 6080 6084 6087] (1218,)


In [57]:
result['Fold №'] = result['Fold №'].astype(int)
result

Unnamed: 0,Accuracy,Fold №
0,0.810345,0
1,0.781609,1
2,0.801314,2
3,0.789819,3
4,0.807061,4


In [58]:
# preds after fit model with KFold.split
preds = model.predict(X_test)
# accuracy test
print(f1_score(y_test, preds))

0.7573770491803279
