In [1]:
import re
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings("ignore")

In [2]:
######################### To Configure Kaggle #########################
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
######################### Load datasets #########################
path_train = "nlp-getting-started/train.csv"
path_test = "nlp-getting-started/test.csv"
train = pd.read_csv(path_train)
test = pd.read_csv(path_test)
print("-"*20+"test"+"-"*20)
print("shape:",test.shape)
print("duplicates:",test.duplicated().sum())
print(test.head(2))
print()
print()
print("-"*20+"train"+"-"*20)
print("shape:",train.shape)
print("duplicates:",test.duplicated().sum())
print(train.head(2))

--------------------test--------------------
shape: (3263, 4)
duplicates: 0
   id keyword location                                               text
0   0     NaN      NaN                 Just happened a terrible car crash
1   2     NaN      NaN  Heard about #earthquake is different cities, s...


--------------------train--------------------
shape: (7613, 5)
duplicates: 0
   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   

   target  
0       1  
1       1  


In [3]:
def handle_mentions(text):
    pattern = re.compile(r"@\w+")
    text = pattern.sub("users",text)
    return text
    
def handle_hashtags(text):
    pattern = re.compile(r"#(\w+)")
    text = pattern.sub(r"\1",text)
    return text
    
def handle_url(text):
    pattern = re.compile(r"https?://\S+|www\.\S+")
    text = pattern.sub("http", text)
    return text
    
def clean_text(text):
    text = text.lower().strip()
    text = handle_url(text)
    text = handle_hashtags(text)
    text = handle_mentions(text)
    return text

def preprocess(df):
    # df["keyword"] = df["keyword"].fillna("keyword_missing")
    # df["clean_text"] = df["keyword"]+" "+df["text"]
    df["clean_text"] = df["text"].apply(clean_text)
    return df


df = preprocess(train.copy())
tfidf = TfidfVectorizer(
    max_features=30000,
    min_df=2,
    max_df=0.95,
    strip_accents= "unicode", # café → cafe
    sublinear_tf= True,
    ngram_range=(1,3),
    stop_words=['http', 'users', 'keyword_missing']
)
vectors_arr = tfidf.fit_transform(df["clean_text"]).toarray()
df_vec = pd.DataFrame(vectors_arr, columns=tfidf.get_feature_names_out())

df_vec["is_keyword"] = df["keyword"].isna().astype(int)

model = RidgeClassifier(alpha=1.0)
model.fit(df_vec.values, df["target"].values)

pred = model.predict(df_vec.values)
f1_score(df["target"].values, pred)

0.9459715639810427

In [4]:
df = preprocess(test.copy())
vectors_arr = tfidf.transform(df["clean_text"]).toarray()
df_vec = pd.DataFrame(vectors_arr, columns=tfidf.get_feature_names_out())
df_vec["is_keyword"] = df["keyword"].isna().astype(int)

pred = model.predict(df_vec.values)
submission = pd.DataFrame()
submission["id"] = test["id"]
submission["target"] = pred
submission.to_csv("submission.csv", index=False)