In [1]:
import pandas as pd

In [2]:
data=pd.read_csv("train.csv")
test=pd.read_csv("test.csv")

In [3]:
data.head(15)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


In [4]:
data['text']=data['text'].str.replace('[^a-zA-Z]','')    
test['text']=test['text'].str.replace('[^a-zA-Z]','')    

In [5]:
data['text']=data['text'].str.lower()
test['text']=test['text'].str.lower()

In [6]:
data['keyword_missing']=data['keyword'].isna().astype('int8')
data['location_missing']=data['location'].isna().astype('int8')

test['keyword_missing']=test['keyword'].isna().astype('int8')
test['location_missing']=test['location'].isna().astype('int8')

In [7]:
data['keyword']=data['keyword'].fillna('missing')
data['location']=data['location'].fillna('')

test['keyword']=test['keyword'].fillna('missing')
test['location']=test['location'].fillna('')

In [8]:
X=data[['text','keyword','location','keyword_missing','location_missing']]
y=data['target'].astype(int)

In [9]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_text=TfidfVectorizer(ngram_range=(1,2),min_df=2,max_df=0.98)
tfidf_loc=TfidfVectorizer(ngram_range=(1,2),min_df=20)
ohe=OneHotEncoder(handle_unknown='ignore')

In [10]:
from scipy.sparse import hstack,csr_matrix

X_text=tfidf_text.fit_transform(data['text'])
X_loc=tfidf_loc.fit_transform(data['location'])
X_ohe=ohe.fit_transform(data[['keyword']])
X_flags=csr_matrix(data[['keyword_missing','location_missing']].values)


In [11]:
X_combined=hstack([X_text,X_ohe,X_loc,X_flags],format='csr')

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

X_train,X_test,y_train,y_test=train_test_split(X_combined,y,test_size=0.2,random_state=42)
model=MultinomialNB()
model.fit(X_train,y_train)

In [13]:
from sklearn.metrics import accuracy_score

prediction=model.predict(X_test)
accuracy_score(y_test,prediction)

0.7912015758371634

In [14]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

param_dist={
    'alpha':uniform(0.01,2),
    'fit_prior':[True,False]}

search=RandomizedSearchCV(
    MultinomialNB(),
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    scoring='f1',
    random_state=42,
    n_jobs=-1)

search.fit(X_train,y_train)
print("Best params:", search.best_params_)

Best params: {'alpha': np.float64(0.12616722433639893), 'fit_prior': False}


In [15]:
import numpy as np

model_final=MultinomialNB(
    alpha=np.float64(0.12616722433639893),
    fit_prior=False
)

model_final.fit(X_train,y_train)
prediction_final=model_final.predict(X_test)
accuracy_score(y_test,prediction_final)

0.7931713722915299

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf_model=RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    random_state=42,
    n_jobs=-1)

rf_model.fit(X_train,y_train)

In [None]:
rf_prediction=rf_model.predict(X_test)
accuracy_score(y_test,rf_prediction)

In [None]:
rf_model.get_params()

In [None]:
from scipy.stats import randint
param_dist={
    'n_estimators':randint(100,500),
    'max_depth':[None]+list(range(10,50,10)),
    'min_samples_split':[2,5,10],
    'min_samples_leaf':[1,2,4],
    'max_features':['sqrt','log2',0.2,0.5,None]
}   
rf=RandomForestClassifier(random_state=42)

rfc_search=RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=15,
    cv=5,
    scoring='f1',
    verbose=2,
    random_state=42,
    n_jobs=-1)
    
rfc_search.fit(X_train,y_train)
print("Best Parameters:",rfc_search.best_params_)

In [None]:
rfc_final=RandomForestClassifier(
    max_depth=40,
    max_features=0.5,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=158)

rfc_final.fit(X_train,y_train)
rfc_prediction=rfc_final.predict(X_test)
accuracy_score(y_test,rfc_prediction)

In [None]:
import xgboost as xgb
xg_reg=xgb.XGBClassifier(objective='binary:logistic')
xg_reg.fit(X_train,y_train)

In [None]:
xg_prediction=xg_reg.predict(X_test)
accuracy_score(y_test,xg_prediction)

In [None]:
xg_reg.get_params()

In [None]:
from scipy.stats import uniform,randint

parameters={
    'max_depth':randint(3,10),
    'learning_rate':uniform(0.01,0.2),
    'n_estimators':randint(100,500),
    'subsample':uniform(0.7,0.3),
    'colsample_bytree':uniform(0.7,0.3),
    'gamma':uniform(0,1),
    'reg_alpha':uniform(0,1),
    'reg_lambda':uniform(1,2)}
Xgb2=xgb.XGBClassifier(eval_metric='logloss')

random_search=RandomizedSearchCV(
    estimator=Xgb2,
    param_distributions=parameters,
    n_iter=50,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1)

random_search.fit(X_train,y_train)
print("Best parameters:",random_search.best_params_)

In [None]:
feature_names=tfidf_text.get_feature_names_out()

class_log_probs=model.feature_log_prob_

n_top20=20
for i, class_label in enumerate(model.classes_):
    top_features=np.argsort(class_log_probs[i])[-n_top20:]
    print(f"\nTop words for class {class_label}:")
    for feat in reversed(top_features):
        print(f"{feature_names[feat]}: {class_log_probs[i][feat]:.4f}")

In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline

word_plus_meta=ColumnTransformer([
    ("word",TfidfVectorizer(ngram_range=(1,2),min_df=2,sublinear_tf=True),"text"),
    ("loc",OneHotEncoder(handle_unknown="ignore",min_frequency=10),["location"]),
    ("kw",OneHotEncoder(handle_unknown="ignore"),["keyword"]),
],verbose_feature_names_out=False)

nb_word=Pipeline([
    ("features",word_plus_meta),
    ("clf",MultinomialNB(alpha=0.6))
])

characters_only=ColumnTransformer([
    ("char",TfidfVectorizer(analyzer="char_wb",ngram_range=(3,5),min_df=3,max_features=60000),"text"),
],verbose_feature_names_out=False)

nb_characters=Pipeline([
    ("features",characters_only),
    ("clf",MultinomialNB(alpha=0.3))
])

word_characters=ColumnTransformer([
    ("word",TfidfVectorizer(ngram_range=(1,2),min_df=2,sublinear_tf=True),"text"),
    ("char",TfidfVectorizer(analyzer="char_wb",ngram_range=(3,5),min_df=3,max_features=40000),"text"),
],verbose_feature_names_out=False)

logreg=Pipeline([
    ("features",word_characters),
    ("clf",LogisticRegression(max_iter=200,n_jobs=-1))
])

In [17]:
ensemble=VotingClassifier(
    estimators=[
        ("nb_word",nb_word),
        ("nb_characters",nb_characters),
        ("logreg",logreg),
    ],
    voting="soft",
    weights=[1,1,1.2],
    n_jobs=-1)

In [18]:
EX_train,EX_test,Ey_train,Ey_test=train_test_split(X,y,test_size=0.2,random_state=42)

ensemble.fit(EX_train,Ey_train)
ensemble_probability=ensemble.predict_proba(EX_test)[:,1]
ensemble_prediction=(ensemble_probability>=0.5).astype(int)
accuracy_score(Ey_test,ensemble_prediction)

0.8168089297439265

In [20]:
ths1=np.linspace(0.1,0.9,41)
f1s1=[f1_score(Ey_test,(ensemble_probability>=t).astype(int)) for t in ths1]
best_t1=ths1[int(np.argmax(f1s1))]

print("Acc @0.5:",accuracy_score(Ey_test,(ensemble_probability>=0.5).astype(int)))
print("Best F1:",max(f1s1),"at threshold",best_t1)

NameError: name 'f1_score' is not defined

In [21]:
from scipy.stats import uniform,randint
param_dist_words={
    "features__word__ngram_range":[(1,1),(1,2)],
    "features__word__min_df":randint(1,5),
    "clf__alpha":uniform(0.1,2.0)}

word_search=RandomizedSearchCV(
    nb_word,
    param_distributions=param_dist_words,
    n_iter=15,
    cv=5,
    scoring="f1",
    n_jobs=-1,
    random_state=42)

word_search.fit(EX_train,Ey_train)
print("Best params:",word_search.best_params_)

Best params: {'clf__alpha': np.float64(0.412037280884873), 'features__word__min_df': 3, 'features__word__ngram_range': (1, 1)}


In [22]:
param_dist_characters={
    "features__char__ngram_range":[(3,5),(4,6)],
    "features__char__min_df":[1,2,3],
    "features__char__max_features":[30000,40000,60000],
    "clf__alpha":uniform(0.05,1.5)}

characters_search=RandomizedSearchCV(
    nb_characters,
    param_distributions=param_dist_characters,
    n_iter=15,
    cv=5,
    scoring="f1",
    n_jobs=-1,
    random_state=42)

characters_search.fit(EX_train,Ey_train)
print("Best params:",characters_search.best_params_)

Best params: {'clf__alpha': np.float64(0.3227374508106509), 'features__char__max_features': 30000, 'features__char__min_df': 1, 'features__char__ngram_range': (4, 6)}


In [23]:
param_dist_logreg={
    "features__word__ngram_range":[(1,1),(1,2)],
    "features__word__min_df":randint(1,6),
    "features__word__max_df":[0.8,0.9,1.0],
    "features__char__ngram_range":[(3,5),(4,6)],
    "features__char__min_df":randint(2,6),
    "features__char__max_features":[20000,40000]}

logreg_search=RandomizedSearchCV(
    logreg,
    param_distributions=param_dist_logreg,
    n_iter=15,
    cv=4,
    scoring="f1",
    n_jobs=-1,
    random_state=42)

logreg_search.fit(EX_train,Ey_train)
print("Best params:",logreg_search.best_params_)

Best params: {'features__char__max_features': 20000, 'features__char__min_df': 5, 'features__char__ngram_range': (4, 6), 'features__word__max_df': 0.9, 'features__word__min_df': 2, 'features__word__ngram_range': (1, 2)}


In [24]:
word_plus_meta2=ColumnTransformer([
    ("word",TfidfVectorizer(ngram_range=(1,1),min_df=3,sublinear_tf=True),"text"),
    ("loc",OneHotEncoder(handle_unknown="ignore",min_frequency=10),["location"]),
    ("kw",OneHotEncoder(handle_unknown="ignore"),["keyword"]),
],verbose_feature_names_out=False)

nb_word2=Pipeline([
    ("features",word_plus_meta2),
    ("clf",MultinomialNB(alpha=np.float64(0.412037280884873)))
])

characters_only2=ColumnTransformer([
    ("char",TfidfVectorizer(analyzer="char_wb",ngram_range=(4,6),min_df=1,max_features=30000),"text"),
],verbose_feature_names_out=False)

nb_characters2=Pipeline([
    ("features",characters_only2),
    ("clf",MultinomialNB(alpha=np.float64(0.3227374508106509)))
])

word_characters2=ColumnTransformer([
    ("word",TfidfVectorizer(ngram_range=(1,1),min_df=2,max_df=0.9,sublinear_tf=True),"text"),
    ("char",TfidfVectorizer(analyzer="char_wb",ngram_range=(4,6),min_df=5,max_features=20000),"text"),
],verbose_feature_names_out=False)

logreg2=Pipeline([
    ("features",word_characters2),
    ("clf",LogisticRegression(max_iter=200,n_jobs=-1))
])


In [25]:
ensemble2=VotingClassifier(
    estimators=[
        ("nb_word",nb_word2),
        ("nb_characters",nb_characters2),
        ("logreg",logreg2),
    ],
    voting="soft",
    weights=[1,1,1.2],
    n_jobs=-1)

In [26]:
ensemble2.fit(EX_train,Ey_train)
ensemble2_probability=ensemble2.predict_proba(EX_test)[:,1]
ensemble2_prediction=(ensemble2_probability>=0.5).astype(int)
accuracy_score(Ey_test,ensemble2_prediction)

0.8187787261982928

In [27]:
from sklearn.metrics import f1_score

ths=np.linspace(0.1,0.9,41)
f1s=[f1_score(Ey_test,(ensemble2_probability>=t).astype(int)) for t in ths]
best_t=ths[int(np.argmax(f1s))]

print("Acc @0.5:",accuracy_score(Ey_test,(ensemble2_probability>=0.5).astype(int)))
print("Best F1:",max(f1s),"at threshold",best_t)

Acc @0.5: 0.8187787261982928
Best F1: 0.7745098039215687 at threshold 0.5


In [28]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

possible_weights=[(1,1,1),
                  (2,1,1),(1,2,1),(1,1,2),
                  (3,1,1),(1,3,1),(1,1,3),
                  (1,1,0),(1,0,1),(0,1,1),
                  (1,0,0),(0,1,0),(0,0,1)]
cv=StratifiedKFold(n_splits=5,shuffle=True,random_state=42)

gs_w=GridSearchCV(
    estimator=ensemble2,
    param_grid={"weights":possible_weights},
    scoring="f1",
    cv=cv,
    n_jobs=-1,
    verbose=0)

gs_w.fit(EX_train,Ey_train)
print("Best weights:",gs_w.best_params_["weights"],"F1:",gs_w.best_score_)

Best weights: (1, 1, 1) F1: 0.7716763206058224


In [29]:
ensemble3=VotingClassifier(
    estimators=[
        ("nb_word",nb_word2),
        ("nb_characters",nb_characters2),
        ("logreg",logreg2),
    ],
    voting="soft",
    weights=[1,1,3],
    n_jobs=-1)

In [30]:
ensemble3.fit(EX_train,Ey_train)
ensemble3_probability=ensemble3.predict_proba(EX_test)[:,1]
ensemble3_prediction=(ensemble3_probability>=0.5).astype(int)
f1_score(Ey_test,ensemble3_prediction)

0.7757085020242915

In [31]:
submission=ensemble3.predict(test)

In [35]:

df=pd.DataFrame({"Id":test.values,
                 "target":submission})

ValueError: Per-column arrays must each be 1-dimensional

In [36]:
test.head()

Unnamed: 0,id,keyword,location,text,keyword_missing,location_missing
0,0,missing,,just happened a terrible car crash,1,1
1,2,missing,,"heard about #earthquake is different cities, s...",1,1
2,3,missing,,"there is a forest fire at spot pond, geese are...",1,1
3,9,missing,,apocalypse lighting. #spokane #wildfires,1,1
4,11,missing,,typhoon soudelor kills 28 in china and taiwan,1,1
