In [6]:
import re, pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

In [5]:
import os

# Location of the raw training CSV. The default is the original author's local
# absolute path; set the SENTIMENT_DATA_PATH environment variable to point the
# notebook at the file on another machine without editing this cell.
data_path = os.environ.get(
    "SENTIMENT_DATA_PATH",
    '/home/ibrahim/git_repos/sentiment-analysis/data/training.1600000.processed.noemoticon.csv',
)

In [8]:
# cleaning tweets with regexp
def clean_tweet(t):
    """Normalize a raw tweet into a lowercase, single-spaced token string.

    Steps, in order:
      1. lowercase the text;
      2. replace links with the placeholder token ``URL``;
      3. replace @mentions with the placeholder token ``USER``;
      4. drop the ``#`` of hashtags but keep the tagged word;
      5. blank out every character that is not an ASCII letter, digit,
         apostrophe or space;
      6. collapse whitespace runs and trim both ends.

    Args:
        t: Tweet text as a ``str``.

    Returns:
        The cleaned string (possibly empty).
    """
    t = t.lower()
    # Require "://" after http(s) or a word-initial "www." so that ordinary
    # words such as "awwww" are not mistaken for links.
    t = re.sub(r"https?://\S+|\bwww\.\S+", " URL ", t)
    t = re.sub(r"@\w+", " USER ", t)
    t = re.sub(r"#(\w+)", r"\1", t)
    # The text is already lowercase here, so the only capitals left are the
    # URL/USER placeholders inserted above; A-Z must be allowed or this step
    # would erase them again.
    t = re.sub(r"[^a-zA-Z0-9' ]+", " ", t)
    t = re.sub(r"\s+", " ", t).strip()
    return t

In [23]:
# load raw data
def load_sentiments(path):
    """Load the raw tweet CSV and return cleaned tweets with remapped labels.

    The file is read as headerless latin-1 CSV with the column layout
    ``label, id, date, query, user, tweet``; only ``label`` and ``tweet`` are
    loaded. Labels are remapped 0 -> 0, 2 -> 1, 4 -> 2 (presumably
    negative / neutral / positive -- confirm against the dataset docs), and
    rows whose label is outside that mapping or whose tweet is missing are
    dropped. Tweets are then normalized with ``clean_tweet``.

    Args:
        path: Path to the CSV file.

    Returns:
        DataFrame with columns ``tweet`` (cleaned str) and ``label`` (int64),
        keeping the original row index of the surviving rows.
    """
    cols = ["label", "id", "date", "query", "user", "tweet"]
    # Only two columns are used downstream; skipping the rest saves memory.
    raw = pd.read_csv(path, encoding="latin-1", names=cols, usecols=["label", "tweet"])
    raw["label"] = raw["label"].map({0: 0, 2: 1, 4: 2})
    # Drop incomplete rows BEFORE the str cast: astype(str) would turn a
    # missing tweet into the literal text "nan", which dropna() cannot see.
    df = raw.dropna(subset=["label", "tweet"]).copy()
    # map() yields floats whenever an unmapped label produced NaN; restore ints.
    df["label"] = df["label"].astype("int64")
    df["tweet"] = df["tweet"].astype(str).map(clean_tweet)
    return df[["tweet", "label"]]

In [24]:
def split(df, test_size=0.2, random_state=33):
    """Split tweets and labels into stratified train and test parts.

    Args:
        df: DataFrame with a ``tweet`` column (features) and a ``label``
            column (targets).
        test_size: Fraction (or absolute count) of rows held out for testing;
            forwarded to ``train_test_split``. Defaults to 0.2.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible. Defaults to 33.

    Returns:
        The ``train_test_split`` result, in the order
        ``X_train, X_test, y_train, y_test``.
    """
    labels = df["label"]
    # Stratify on the label so both parts keep the same class proportions.
    return train_test_split(
        df["tweet"],
        labels,
        test_size=test_size,
        random_state=random_state,
        stratify=labels,
    )

In [None]:
# building baseline model
import mlflow, joblib
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report

from pathlib import Path

# Load the cleaned dataset and hold out a stratified test set.
df = load_sentiments(data_path)
X_train, X_test, y_train, y_test = split(df)

# Baseline: TF-IDF over unigrams and bigrams feeding a class-balanced
# logistic regression.
pipe = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1,2), min_df=2, max_df=0.9)),
    ("clf", LogisticRegression(max_iter=200, class_weight="balanced", n_jobs=None)),
])

# Where the fitted pipeline is persisted. joblib.dump does not create missing
# directories, so make sure the target folder exists before training starts.
model_path = Path("../models/model.joblib")
model_path.parent.mkdir(parents=True, exist_ok=True)

with mlflow.start_run():
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # Macro-averaged F1 weights every class equally regardless of its size.
    f1 = f1_score(y_test, y_pred, average="macro")
    mlflow.log_metric("f1_macro", f1)
    mlflow.log_text(classification_report(y_test, y_pred), "cls_report.txt")
    joblib.dump(pipe, model_path)
    mlflow.log_artifact(str(model_path))
# Report the path actually written (relative to the notebook's working dir).
print(f"Saved {model_path}")