# Learners

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import nltk
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from nltk.stem.snowball import SnowballStemmer

# Shared English Snowball stemmer used by the stemming vectorizer in this notebook.
english_stemmer = SnowballStemmer("english")

class StemmedTfidfVectorizer(TfidfVectorizer):
    """TF-IDF vectorizer whose tokens are stemmed before they are counted.

    Tokenisation and preprocessing are delegated to the parent analyzer; each
    token it yields is then passed through the module-level ``english_stemmer``.
    """

    def build_analyzer(self):
        """Return a callable mapping a document to an iterator of stemmed tokens."""
        # Zero-argument super() starts the lookup right after this class in the
        # MRO. The previous super(TfidfVectorizer, self) started after
        # TfidfVectorizer itself, which would silently bypass any
        # build_analyzer defined on TfidfVectorizer.
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            # Lazily stem each token produced by the parent analyzer.
            return (english_stemmer.stem(token) for token in base_analyzer(doc))

        return stemmed_analyzer
    
    

# Input CSVs live in the local "data" directory. Forward slashes resolve on
# Windows as well as POSIX systems, whereas a backslash-separated path is only
# understood on Windows.
train_file = "data/train.csv"
test_file = "data/test_text.csv"

train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)

# Hold out 0.1% of the labelled rows as a validation set; the rest is used for
# training. The fixed random_state keeps the split reproducible across runs.
# NOTE(review): a hold-out this small gives only a rough sanity check.
X, X_valid, y, y_valid = train_test_split(
    train_df["text"],
    train_df["subreddit"],
    test_size=0.001,
    random_state=42,
)


In [3]:
# Learn the vocabulary and IDF weights from every available document (all
# training text plus the unlabeled test text), then vectorize each split with
# the fitted vectorizer.
tfidf_transformer = StemmedTfidfVectorizer().fit([*train_df.text, *test_df.text])

X_tfidf, X_test_tfidf, X_valid_tfidf = (
    tfidf_transformer.transform(texts) for texts in (X, test_df.text, X_valid)
)

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# Multinomial naive Bayes with a uniform class prior and light smoothing.
# (LinearSVC() was the alternative previously left commented out here.)
model = MultinomialNB(alpha=0.1, fit_prior=False).fit(X_tfidf, y)

# Label the competition test set and the held-out validation rows.
predicted_categories, predicted_categories_valid = (
    model.predict(features) for features in (X_test_tfidf, X_valid_tfidf)
)

In [5]:
from sklearn.metrics import classification_report, f1_score

# Micro-averaged F1 on the hold-out rows; for single-label multiclass
# predictions this equals plain accuracy.
# NOTE(review): the hold-out is 0.1% of the training rows (test_size=0.001 in
# the split), so this score rests on very few samples.
f1_score(y_true=y_valid, y_pred=predicted_categories_valid, average="micro")

0.7333333333333333

In [6]:
# Attach the model's predicted label for every test row (mutates test_df in place).
test_df["subreddit"] = predicted_categories

In [7]:
# Expose the DataFrame index as an explicit "id" column (mutates test_df in place).
test_df["id"] = test_df.index

In [8]:
# The submission keeps every column of test_df except the raw text.
submission_df = test_df.drop("text", axis="columns")

In [9]:
# Write the submission to disk without the DataFrame index column.
submission_df.to_csv("submission_2.csv", index=False)

# Ensemble

In [15]:

# Load submission_1.csv through submission_4.csv, one DataFrame per file, in order.
dfs = list(map(pd.read_csv, (f"submission_{i}.csv" for i in range(1, 5))))


In [21]:
from collections import Counter

# Majority-vote ensemble over the submissions in `dfs`.
# The first submission supplies the row index and every non-label column.
df_ensemble = dfs[0].copy()

# Pull each model's labels once, aligned by index label to df_ensemble. This
# replaces a per-row .loc lookup inside iterrows(); a label missing from any
# submission still raises KeyError.
votes_per_model = [df.loc[df_ensemble.index, "subreddit"].tolist() for df in dfs]

# Counter keeps insertion order, so on a tie the label from the earliest
# submission in `dfs` wins.
df_ensemble["subreddit"] = [
    Counter(row_votes).most_common(1)[0][0] for row_votes in zip(*votes_per_model)
]
    

In [24]:
df_ensemble.to_csv("submission_ensemble.csv", index=False)