In [1]:
import pandas as pd 

In [2]:
# Read the labelled training file and the test file from the working directory.
data, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [3]:
def clean(data):
    """Return a copy of ``data`` with a normalised ``text`` column.

    The text is lower-cased and every character that is not an ASCII letter
    or a space is removed (digits, punctuation and other symbols are dropped
    without inserting a replacement character).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a string ``text`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns; the frame passed in is not
        modified, so the raw text stays available to the caller.
    """
    cleaned_text = (
        data['text']
        .str.lower()
        .str.replace('[^a-zA-Z ]', '', regex=True)
    )
    # assign() builds a new DataFrame instead of writing into the argument.
    return data.assign(text=cleaned_text)

In [4]:
# Apply the same text normalisation to the training and the test frame.
data, test = clean(data), clean(test)

In [5]:
# Replace missing keyword/location values with empty strings in both frames
# so the downstream encoders never receive NaN.
data[['keyword', 'location']] = data[['keyword', 'location']].fillna('')
test[['keyword', 'location']] = test[['keyword', 'location']].fillna('')

In [6]:
# Model inputs: the cleaned text plus the two metadata columns.
X = data.loc[:, ['text', 'location', 'keyword']]
# Label column to predict.
y = data.loc[:, 'target']

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for evaluation; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# Word uni-/bi-gram TF-IDF for the text; terms must appear in at least 2
# documents and in at most 98% of them.
tfidf_text = TfidfVectorizer(min_df=2, max_df=0.98, ngram_range=(1, 2))
# Same n-gram range for the location string, keeping only terms seen in at
# least 20 documents.
tfidf_loc = TfidfVectorizer(min_df=20, ngram_range=(1, 2))
# Categories unseen during fit are encoded as all-zero rows instead of raising.
ohe = OneHotEncoder(handle_unknown='ignore')

In [None]:
# ColumnTransformer is imported here because this cell precedes the cell that
# imports it further down the notebook; without this the cell raises NameError
# on a fresh kernel.
from sklearn.compose import ColumnTransformer

# Bundle the three per-column encoders into a single transformer.
# 'text' and 'location' are given as plain strings so each vectorizer receives
# a 1-D column of documents; 'keyword' is given as a list so the one-hot
# encoder receives a 2-D, single-column frame.
preproc = ColumnTransformer(
    transformers=[
        ('text', tfidf_text, 'text'),
        ('loc', tfidf_loc, 'location'),
        ('kw', ohe, ['keyword']),
    ],  # the comma after this list was missing, making the cell a SyntaxError
    remainder='drop',       # columns not listed above are discarded
    sparse_threshold=1.0,   # stack the outputs as a sparse matrix
)

In [None]:
from scipy.sparse import hstack 

# Fit every encoder on the training split and encode it in the same step.
X_text_train = tfidf_text.fit_transform(X_train['text'].fillna(''))
X_ohe_train = ohe.fit_transform(X_train[['keyword']])
X_loc_train = tfidf_loc.fit_transform(X_train['location'].fillna(''))

# Column-wise stack in the order text | keyword | location; the hold-out
# matrix must be stacked in the same order.
X_ctrain = hstack((X_text_train, X_ohe_train, X_loc_train), format='csr')

In [None]:
# Encode the hold-out split with the already-fitted encoders (transform only,
# so nothing is learned from the hold-out rows).
X_text_test = tfidf_text.transform(X_test['text'].fillna(''))
X_ohe_test = ohe.transform(X_test[['keyword']])
X_loc_test = tfidf_loc.transform(X_test['location'].fillna(''))

# Same column order as the training matrix: text | keyword | location.
X_ctest = hstack((X_text_test, X_ohe_test, X_loc_test), format='csr')

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score

# Baseline: multinomial Naive Bayes on the manually stacked sparse features.
nbmodel = MultinomialNB().fit(X_ctrain, y_train)
predict = nbmodel.predict(X_ctest)
# F1 on the hold-out split; left as the last expression so the cell shows it.
f1_score(y_true=y_test, y_pred=predict)

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline

# --- Model 1: Naive Bayes on word TF-IDF plus one-hot metadata -------------
word_plus_meta = ColumnTransformer(
    transformers=[
        (
            "word",
            TfidfVectorizer(ngram_range=(1, 2), min_df=2, sublinear_tf=True),
            "text",
        ),
        # Locations occurring fewer than 10 times are grouped as infrequent.
        (
            "loc",
            OneHotEncoder(handle_unknown="ignore", min_frequency=10),
            ["location"],
        ),
        ("kw", OneHotEncoder(handle_unknown="ignore"), ["keyword"]),
    ],
    verbose_feature_names_out=False,
)

nb_word = Pipeline(
    steps=[
        ("features", word_plus_meta),
        ("clf", MultinomialNB(alpha=0.6)),
    ]
)

# --- Model 2: Naive Bayes on character n-grams of the text only ------------
characters_only = ColumnTransformer(
    transformers=[
        (
            "char",
            TfidfVectorizer(
                analyzer="char_wb",
                ngram_range=(3, 5),
                min_df=3,
                max_features=60000,
            ),
            "text",
        ),
    ],
    verbose_feature_names_out=False,
)

nb_characters = Pipeline(
    steps=[
        ("features", characters_only),
        ("clf", MultinomialNB(alpha=0.3)),
    ]
)

# --- Model 3: logistic regression on word + character TF-IDF ---------------
word_characters = ColumnTransformer(
    transformers=[
        (
            "word",
            TfidfVectorizer(ngram_range=(1, 2), min_df=2, sublinear_tf=True),
            "text",
        ),
        (
            "char",
            TfidfVectorizer(
                analyzer="char_wb",
                ngram_range=(3, 5),
                min_df=3,
                max_features=40000,
            ),
            "text",
        ),
    ],
    verbose_feature_names_out=False,
)

logreg = Pipeline(
    steps=[
        ("features", word_characters),
        ("clf", LogisticRegression(max_iter=200, n_jobs=-1)),
    ]
)

In [10]:
# Soft voting averages the predicted class probabilities of the three
# pipelines; the logistic regression gets a slightly larger weight.
ensemble = VotingClassifier(
    estimators=[("nb_word", nb_word), ("nb_characters", nb_characters), ("logreg", logreg)],
    voting="soft",
    weights=[1, 1, 1.2],
    n_jobs=-1,
)

In [11]:
from sklearn.metrics import f1_score

# Fit the un-tuned ensemble on the training split and score it on the hold-out
# split (fit() returns the estimator, so the calls can be chained).
predict = ensemble.fit(X_train, y_train).predict(X_test)
f1_score(y_true=y_test, y_pred=predict)

0.7732893652102226

In [12]:
from scipy.stats import uniform,randint
from sklearn.model_selection import RandomizedSearchCV

# Search space for the word-level Naive Bayes pipeline.
param_dist_words = {
    "features__word__ngram_range": [(1, 1), (1, 2)],
    # randint's upper bound is exclusive: min_df is drawn from 1..4.
    "features__word__min_df": randint(1, 5),
    # scipy's uniform(loc, scale) samples from [loc, loc + scale] = [0.1, 2.1].
    "clf__alpha": uniform(loc=0.1, scale=2.0),
}

# 15 random candidates, each scored by 5-fold cross-validated F1.
word_search = RandomizedSearchCV(
    estimator=nb_word,
    param_distributions=param_dist_words,
    n_iter=15,
    scoring="f1",
    cv=5,
    n_jobs=-1,
    random_state=42,
)
word_search.fit(X_train, y_train)

print("Best params:", word_search.best_params_)

Best params: {'clf__alpha': np.float64(0.412037280884873), 'features__word__min_df': 3, 'features__word__ngram_range': (1, 1)}


In [13]:
# Search space for the character-level Naive Bayes pipeline.
param_dist_characters = {
    "features__char__ngram_range": [(3, 5), (4, 6)],
    "features__char__min_df": [1, 2, 3],
    "features__char__max_features": [30000, 40000, 60000],
    # scipy's uniform(loc, scale) samples from [loc, loc + scale] = [0.05, 1.55].
    "clf__alpha": uniform(loc=0.05, scale=1.5),
}

# 15 random candidates, each scored by 5-fold cross-validated F1.
characters_search = RandomizedSearchCV(
    estimator=nb_characters,
    param_distributions=param_dist_characters,
    n_iter=15,
    scoring="f1",
    cv=5,
    n_jobs=-1,
    random_state=42,
)
characters_search.fit(X_train, y_train)

print("Best params:", characters_search.best_params_)

Best params: {'clf__alpha': np.float64(0.28399178050430396), 'features__char__max_features': 60000, 'features__char__min_df': 3, 'features__char__ngram_range': (4, 6)}


In [14]:
# Search space for the logistic-regression pipeline: only the two TF-IDF
# vectorizers are tuned, the classifier keeps its settings.
param_dist_logreg = {
    "features__word__ngram_range": [(1, 1), (1, 2)],
    # randint's upper bound is exclusive: values 1..5.
    "features__word__min_df": randint(1, 6),
    "features__word__max_df": [0.8, 0.9, 1.0],
    "features__char__ngram_range": [(3, 5), (4, 6)],
    # values 2..5
    "features__char__min_df": randint(2, 6),
    "features__char__max_features": [20000, 40000],
}

# 15 random candidates scored by cross-validated F1; this search uses 4 folds
# whereas the two Naive Bayes searches use 5.
logreg_search = RandomizedSearchCV(
    estimator=logreg,
    param_distributions=param_dist_logreg,
    n_iter=15,
    scoring="f1",
    cv=4,
    n_jobs=-1,
    random_state=42,
)
logreg_search.fit(X_train, y_train)

print("Best params:", logreg_search.best_params_)

Best params: {'features__char__max_features': 40000, 'features__char__min_df': 3, 'features__char__ngram_range': (4, 6), 'features__word__max_df': 0.9, 'features__word__min_df': 4, 'features__word__ngram_range': (1, 1)}


In [15]:
import numpy as np

from sklearn.base import clone

# Build the tuned pipelines from the un-tuned templates and overwrite only the
# hyper-parameters chosen by the corresponding randomized search. The values
# used to be re-typed by hand from the printed "Best params" output, which
# duplicated the three pipeline definitions and would silently go stale
# whenever a search is re-run with different data, seed or search space.
# clone() returns an unfitted deep copy, so the templates stay unchanged.
nb_word2 = clone(nb_word).set_params(**word_search.best_params_)
nb_characters2 = clone(nb_characters).set_params(**characters_search.best_params_)
logreg2 = clone(logreg).set_params(**logreg_search.best_params_)

# Keep the feature-extraction stages reachable under their previous names.
word_plus_meta2 = nb_word2.named_steps["features"]
characters_only2 = nb_characters2.named_steps["features"]
word_characters2 = logreg2.named_steps["features"]

In [16]:
# Same soft-voting set-up as the first ensemble, built from the tuned pipelines.
ensemble2 = VotingClassifier(
    estimators=[("nb_word", nb_word2), ("nb_characters", nb_characters2), ("logreg", logreg2)],
    voting="soft",
    weights=[1, 1, 1.2],
    n_jobs=-1,
)

In [17]:
# Fit the tuned ensemble on the training split and report hold-out F1.
predict2 = ensemble2.fit(X_train, y_train).predict(X_test)
f1_score(y_true=y_test, y_pred=predict2)

0.7691056910569106

In [18]:
from sklearn.metrics import accuracy_score
# Hold-out accuracy of the tuned ensemble, shown alongside the F1 above.
accuracy_score(y_true=y_test, y_pred=predict2)

0.8135259356533159

In [19]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Candidate voting weights in the order (nb_word, nb_characters, logreg).
# A weight of 0 removes that model from the vote.
possible_weights = [
    # equal vote
    (1, 1, 1),
    # one model counted double
    (2, 1, 1), (1, 2, 1), (1, 1, 2),
    # one model counted triple
    (3, 1, 1), (1, 3, 1), (1, 1, 3),
    # pairs only
    (1, 1, 0), (1, 0, 1), (0, 1, 1),
    # single models
    (1, 0, 0), (0, 1, 0), (0, 0, 1),
]

# Shuffled, stratified 5-fold split with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search over the weight tuples, scored by cross-validated F1.
gs_w = GridSearchCV(
    estimator=ensemble2,
    param_grid={"weights": possible_weights},
    scoring="f1",
    cv=cv,
    n_jobs=-1,
    verbose=0,
)
gs_w.fit(X_train, y_train)

print("Best weights:", gs_w.best_params_["weights"], "F1:", gs_w.best_score_)

Best weights: (1, 1, 3) F1: 0.7720297555777834


In [24]:
# Final soft-voting ensemble of the tuned pipelines. The voting weights are
# read from the weight grid search instead of being re-typed from its printed
# output, so they cannot drift from what the search actually selected.
ensemblefinal=VotingClassifier(
    estimators=[
        ("nb_word",nb_word2),
        ("nb_characters",nb_characters2),
        ("logreg",logreg2),
    ],
    voting="soft",
    # Order matches `estimators`: (nb_word, nb_characters, logreg).
    weights=list(gs_w.best_params_["weights"]),
    n_jobs=-1)

In [25]:
# Fit the final ensemble on the training split and report hold-out F1.
predictfinal = ensemblefinal.fit(X_train, y_train).predict(X_test)
f1_score(y_true=y_test, y_pred=predictfinal)

0.7754442649434572

In [26]:
# Predict a label for every row of the cleaned `test` frame with the final
# ensemble. `test` is passed whole (it still carries `id`, used for the
# submission file); presumably the pipelines select `text`/`location`/`keyword`
# by name and ignore the other columns — TODO confirm.
# NOTE(review): ensemblefinal was fitted on X_train only (the 80% split);
# consider refitting on all labelled rows (X, y) before producing the
# submission.
submission=ensemblefinal.predict(test)

In [27]:
# Pair each test id with its predicted label and write the two-column
# submission file (no index column).
df = test[['id']].assign(target=submission)
df.to_csv('submission.csv', index=False)

In [29]:
# Sanity check: print the (rows, columns) shape of the submission frame.
print(df.shape)

(3263, 2)
