In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split

## Получени и анализ данных

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [19]:
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [8]:
train.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [18]:
train.keyword.unique()

array([nan, 'ablaze', 'accident', 'aftershock', 'airplane%20accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew%20up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown%20up', 'body%20bag', 'body%20bagging',
       'body%20bags', 'bomb', 'bombed', 'bombing', 'bridge%20collapse',
       'buildings%20burning', 'buildings%20on%20fire', 'burned',
       'burning', 'burning%20buildings', 'bush%20fires', 'casualties',
       'casualty', 'catastrophe', 'catastrophic', 'chemical%20emergency',
       'cliff%20fall', 'collapse', 'collapsed', 'collide', 'collided',
       'collision', 'crash', 'crashed', 'crush', 'crushed', 'curfew',
       'cyclone', 'damage', 'danger', 'dead', 'death', 'deaths', 'debris',
       'deluge', 'deluged', 'demolish', 'demolished', 'demolition',
       'derail', 'der

In [45]:
train[pd.isnull(train['keyword'])].head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [131]:
train.text[150]

"@mickinyman @TheAtlantic That or they might be killed in an airplane accident in the night a car wreck! Politics at it's best."

In [215]:
train.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

## Предобработка

In [5]:
copy = train.copy()

In [6]:
copy.drop(columns=['location'], axis = 1, inplace = True)

In [7]:
stop_words = stopwords.words('english')
url = 'http'
lemmatizer = WordNetLemmatizer()
stop_words.remove('not')
stop_words.append('us')

In [8]:
def data_preprocessing(twit):
    twit = re.sub(re.compile('[.:<>#*/,%;$!?^№()""0-9=+]'), ' ', str(twit))
    twit = re.sub('[^A-Za-z]+', ' ', twit)
    
    twit = twit.lower()
    
    tokens = twit.split()
    
    twit = []
    for word in tokens:
        if word not in stop_words and url not in word and '@' not in word:
            twit.append(word)
    twit = [lemmatizer.lemmatize(word) for word in twit]
    
    twit = ' '.join(twit)
    
    return twit

In [11]:
copy['text_pre'] = copy['text'].apply(lambda x: data_preprocessing(x))

In [12]:
copy.text_pre[0]

'deed reason earthquake may allah forgive'

In [13]:
copy.head()

Unnamed: 0,id,keyword,text,target,key_pre,text_pre
0,1,,Our Deeds are the Reason of this #earthquake M...,1,,deed reason earthquake may allah forgive
1,4,,Forest fire near La Ronge Sask. Canada,1,,forest fire near la ronge sask canada
2,5,,All residents asked to 'shelter in place' are ...,1,,resident asked shelter place notified officer ...
3,6,,"13,000 people receive #wildfires evacuation or...",1,,people receive wildfire evacuation order calif...
4,7,,Just got sent this photo from Ruby #Alaska as ...,1,,got sent photo ruby alaska smoke wildfire pour...


## Обучение

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

X = copy.text_pre
y = copy.target

xtrain, xtest, ytrain, ytest = train_test_split(X, y, stratify=y)

tf = TfidfVectorizer(min_df=10)

xtrain_tf = tf.fit_transform(xtrain)
xtest_tf = tf.transform(xtest)

In [17]:
from xgboost import XGBClassifier

xgb = XGBClassifier(n_estimators = 300, max_depth= 6, learning_rate =  0.01, use_label_encoder=False, n_jobs=-1)

xgb.fit(xtrain_tf, ytrain)

pred = xgb.predict(xtest_tf)

f1_score(pred, ytest)



0.6313645621181263

In [15]:
clf = LogisticRegression(n_jobs=-1)

clf.fit(xtrain_tf, ytrain)

pred = clf.predict(xtest_tf)

f1_score(pred, ytest)

0.7473821989528796

In [220]:
subm = pd.read_csv('sample_submission.csv')
subm.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


Для теста

In [221]:
test['text_pre'] = test['text'].apply(lambda x: data_preprocessing(x))

test_tf = tf.transform(test.text_pre)
ans = clf.predict(test_tf)

In [222]:
subm['target'] = ans
subm.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [224]:
subm.to_csv('answer1.csv', index=False)