In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

Base models (one text-classification pipeline each) that are combined in the voting ensemble:

In [34]:
# Linear SVM pipeline: TF-IDF features over unigrams and bigrams (English
# stop words removed, sublinear term-frequency scaling) feeding an
# L1-penalised LinearSVC.
BEST_model = Pipeline(
    steps=[
        (
            'vectorizer',
            TfidfVectorizer(
                stop_words='english',
                ngram_range=(1, 2),
                sublinear_tf=True,
            ),
        ),
        (
            'classifier',
            LinearSVC(
                penalty='l1',
                loss='squared_hinge',
                dual='auto',  # let scikit-learn pick the primal/dual formulation
                C=1,
                max_iter=15000,
            ),
        ),
    ]
)


# Multinomial Naive Bayes pipeline: raw unigram + bigram counts (English stop
# words removed, vocabulary capped at 9640 features) with smoothing alpha=0.5.
best_Multinomial = Pipeline(
    steps=[
        (
            'vectorizer',
            CountVectorizer(
                stop_words='english',
                ngram_range=(1, 2),
                max_features=9640,
            ),
        ),
        (
            'classifier',
            MultinomialNB(alpha=0.5),
        ),
    ]
)


# Gradient-boosted trees pipeline: TF-IDF over unigrams and bigrams (English
# stop words removed, vocabulary capped at 43380 features) feeding an
# XGBClassifier with trees of depth 6, evaluated with multiclass log-loss.
# NOTE(review): `use_label_encoder` is deprecated in recent XGBoost releases
# and may only produce an "unused parameter" warning there -- confirm against
# the installed version before removing it.
best_xgb = Pipeline(
    steps=[
        (
            'vectorizer',
            TfidfVectorizer(
                stop_words='english',
                ngram_range=(1, 2),
                max_features=43380,
            ),
        ),
        (
            'classifier',
            XGBClassifier(
                max_depth=6,
                eval_metric='mlogloss',
                use_label_encoder=False,
            ),
        ),
    ]
)

Voting ensemble: the three pipelines above are combined with hard (majority) voting:

In [35]:
# Ensemble of the three pipelines. With voting='hard' the final label is the
# majority vote over each pipeline's predicted class, so the members do not
# need to expose predict_proba.
voting_clf = VotingClassifier(
    voting='hard',
    estimators=[
        ('multi', best_Multinomial),
        ('svm', BEST_model),
        ('xgb', best_xgb),
    ],
)

In [40]:
def train_test(train_path, test_path, output_path="predictions.txt"):
    """Fit the voting ensemble and write its predictions for the test file.

    The training CSV is split 80/20 (stratified on the label, fixed seed).
    The module-level ``voting_clf`` is fitted in place on the 80% part, its
    accuracy on the remaining 20% is printed, and the predicted labels for
    the test CSV are written one per line to ``output_path``.

    Parameters
    ----------
    train_path : str or path-like
        CSV file with a ``"text"`` column (inputs) and an ``"emotion"``
        column (labels).
    test_path : str or path-like
        CSV file with a ``"text"`` column to predict on.
    output_path : str or path-like, default "predictions.txt"
        Destination file for the predicted labels; overwritten if it exists.

    Returns
    -------
    None
    """
    # Read both files up front so a bad test path fails before the
    # (expensive) model fit rather than after it.
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    X = train_df["text"]
    y = train_df["emotion"]

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=52
    )

    # Fits the shared module-level estimator; it stays fitted after the call.
    voting_clf.fit(X_train, y_train)

    # The hold-out part was previously computed and then ignored; use it to
    # report how well the fitted ensemble generalises.
    val_accuracy = accuracy_score(y_val, voting_clf.predict(X_val))
    print(f"Hold-out accuracy: {val_accuracy:.4f}")

    y_pred = voting_clf.predict(test_df["text"])

    with open(output_path, "w") as f:
        # Format through an f-string so non-string labels (e.g. integer
        # class ids) are written instead of raising TypeError on `+ "\n"`.
        f.writelines(f"{label}\n" for label in y_pred)

In [42]:
# Example run (uncomment to train the ensemble and write predictions.txt):
# train_test("train_emotion.csv", "emotions_dataset_new.csv")