In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import re
import nltk

In [None]:
# Placeholder locations -- fill these in before running the notebook.
# train_path / test_path are passed straight to pd.read_csv.
train_path = " "
test_path = " "
# save_path is string-concatenated with the output file name when the
# submission is written, so it needs a trailing path separator.
save_path = " "

In [None]:
# Read the training and test CSV files (in that order) into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path)
)

In [None]:
# (rows, columns) of the training frame, then of the test frame, one per line.
print(train_data.shape, test_data.shape, sep='\n')

In [None]:
# Peek at the first three training rows.
train_data.head(3)

In [None]:
# Peek at the last three training rows.
train_data.tail(3)

In [2]:

def missing_stats(df):
    """Summarise missing values per column of ``df``.

    Returns a DataFrame with one row per column of ``df`` and the columns
    'COLUMN NAME', 'MISSING VALUES' (null count), 'TOTAL ROWS' (row count of
    ``df``) and '% MISSING' (null share in percent, rounded to 2 decimals).
    """
    null_counts = df.isnull().sum()
    n_rows = df.shape[0]

    summary = pd.DataFrame({
        'COLUMN NAME': null_counts.index,
        'MISSING VALUES': null_counts.values,
    })
    summary['TOTAL ROWS'] = n_rows
    share = (summary['MISSING VALUES'] / summary['TOTAL ROWS']) * 100
    summary['% MISSING'] = share.round(2)
    return summary

In [None]:
# Per-column missing-value counts and percentages for the training data.
missing_stats(train_data)

In [None]:
# Bar chart of how many training rows carry each Label value.
train_data['Label'].value_counts().plot.bar()

In [None]:
# Fetch NLTK's stop-word corpus (read via nltk.corpus.stopwords).
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords

In [None]:
# Show NLTK's English stop words for reference only; text_processing filters
# tokens against the custom `chichewa` list, not this one.
print(stopwords.words('english'))

In [None]:
# Chichewa stop words used by text_processing to drop uninformative tokens.
#
# Every entry is lower-case with no surrounding whitespace or invisible
# characters, because tokens are compared after lower-casing and splitting on
# whitespace -- an entry such as ' by' or 'pa ' could never match.
# Each pair of adjacent entries is separated by a comma: two adjacent string
# literals without one are silently concatenated ('chifukwa' 'ndi' would
# become the single bogus entry 'chifukwandi').
# NOTE(review): the multi-word entries ('iwe ukhoza', 'khalani nawo',
# 'pulogalamu ya', 'muyenera kukhala') are kept for reference but cannot match
# a single whitespace-split token -- confirm whether they should be split.
chichewa = [
    'i', 'ine', 'wanga', 'inenso', 'ife', 'athu',
    'tokha', 'inu', 'ndinu', 'iwe ukhoza',
    'wako', 'wekha', 'nokha', 'iye', 'wake', 'iyemwini',
    'icho', 'ndi', 'zake', 'lokha', 'iwo', 'awo', 'iwowo',
    'chiyani', 'amene', 'uyu', 'uyo', 'awa', 'ndili',
    'ali', 'anali', 'khalani', 'akhala', 'kukhala',
    'khalani nawo', 'wakhala', 'chitani',
    'amachita', 'kuchita', 'a', 'an', 'pulogalamu ya',
    'koma', 'ngati', 'kapena', 'chifukwa',
    'monga', 'mpaka', 'pamene', 'wa', 'pa', 'by',
    'pafupi', 'kutsutsana', 'pakati',
    'kupyola', 'nthawi', 'kale', 'pambuyo',
    'pamwamba', 'pansipa', 'kuti', 'kuchokera',
    'mmwamba', 'pansi', 'mu', 'kunja', 'kuyatsa',
    'kuchoka', 'kutha', 'kachiwiri', 'kupitilira',
    'kenako', 'kamodzi', 'apa', 'apo', 'liti', 'pati',
    'bwanji', 'onse', 'aliyense',
    'ochepa', 'zambiri', 'ambiri', 'ena', 'otero',
    'ayi', 'osati', 'okha', 'eni', 'omwewo',
    'kotero', 'kuposa', 'nawonso', 'kwambiri', 'angathe',
    'ndidzatero', 'basi', 'musatero', 'musachite',
    'muyenera', 'muyenera kukhala', 'tsopano', 'sali',
    'sindinathe', 'sanachite', 'satero', 'analibe',
    'sanatero', 'sindinatero', 'si',
    'ma', 'sizingatheke', 'mwina', 'sayenera', 'osowa',
    'osafunikira', 'shan', 'nenani', 'sanali',
    'anapambana', 'sangachite', 'sanakonde', 'sangatero',
]

In [None]:
# Text-cleaning helpers
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

wn = WordNetLemmatizer()
ps = PorterStemmer()  # instantiated here but not used by text_processing


def text_processing(text):
    """Turn one raw document into a cleaned, space-joined token string.

    Steps: replace every character outside a-z / A-Z with a space, lower-case,
    split on whitespace, drop tokens found in the module-level ``chichewa``
    stop-word list, lemmatise the remaining tokens with WordNet, and join
    them back together with single spaces.

    Parameters
    ----------
    text : str
        Raw document text. Any non-string value (for example the float NaN
        pandas uses for a missing cell) is treated as empty text instead of
        raising ``TypeError`` inside ``re.sub``.

    Returns
    -------
    str
        The cleaned text; ``''`` when nothing survives cleaning.
    """
    if not isinstance(text, str):
        return ''

    # Set lookup instead of scanning the stop-word list once per token.
    stop_words = set(chichewa)
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    return ' '.join(wn.lemmatize(token) for token in tokens if token not in stop_words)

In [None]:
# Fetch the WordNet corpus. Presumably this must have run before
# text_processing first calls wn.lemmatize -- TODO confirm.
nltk.download('wordnet')

In [None]:
# Clean the 'Text' column of both frames element-wise; the raw text is overwritten.
train_data['Text'] = train_data['Text'].map(text_processing)
test_data['Text'] = test_data['Text'].map(text_processing)

In [None]:
# Spot-check the cleaned training text (first five rows, plain-text rendering).
print(train_data.head())

In [None]:
# Spot-check the cleaned test text (first five rows, plain-text rendering).
print(test_data.head())

In [None]:
# Fit TF-IDF on the training text only, then project both the training and the
# test text onto that same vocabulary.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(train_data['Text']).toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# One column per vocabulary term, one row per training document.
training = pd.DataFrame(X, columns = feature_names)

print(training.shape)


# transform (not fit_transform): the test matrix reuses the fitted vocabulary,
# so its columns line up with `training`.
X_test = vectorizer.transform(test_data['Text']).toarray()
test_new = pd.DataFrame(X_test, columns = feature_names)

print(test_new.shape)

In [None]:
# Peek at the first rows of the TF-IDF feature frame.
training.head()

In [None]:
# Features: the TF-IDF frame; target: the 'Label' column of the training data.
X, y = training, train_data['Label']


In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split

# scikit-learn has no `SDGClassifier`; importing that name raises ImportError.
# Alias kept so any code still spelling it the old way keeps working.
SDGClassifier = SGDClassifier


In [None]:
# 80/20 hold-out split with a fixed seed. Note that this rebinds X_test, which
# until now held the TF-IDF matrix of test_data (that data lives on in test_new).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Linear model trained with stochastic gradient descent: hinge loss with an
# L2 penalty. tol=None turns off the tolerance-based stopping criterion, so
# training runs for max_iter epochs.
sgd = SGDClassifier(loss = 'hinge',
                    penalty = 'l2',
                    alpha = 1e-3,
                    random_state = 42,
                    max_iter = 5,
                    tol = None)

In [None]:
# Train the classifier on the training portion of the split.
sgd.fit(X_train,y_train)


In [None]:
# Mean accuracy on the data the model was fitted on.
sgd.score(X_train,y_train)

In [None]:
# Predicted labels for the held-out portion of the split.
sgd_preds = sgd.predict(X_test)


In [None]:
# Mean accuracy on the held-out portion of the split.
sgd.score(X_test, y_test)

In [None]:
# Predicted labels for test_new, the TF-IDF features of test_data.
test_pred = sgd.predict(test_new)

In [None]:
# The twenty category names, in their original order.
tags = [
    'POLITICS',
    'SOCIAL',
    'RELIGION',
    'LAW/ORDER',
    'SOCIAL ISSUES',
    'HEALTH',
    'ECONOMY',
    'FARMING',
    'SPORTS',
    'EDUCATION',
    'RELATIONSHIPS',
    'WILDLIFE/ENVIRONMENT',
    'OPINION/ESSAY',
    'LOCALCHIEFS',
    'CULTURE',
    'WITCHCRAFT',
    'MUSIC',
    'TRANSPORT',
    'ARTS AND CRAFTS',
    'FLOODING',
]

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the hold-out predictions in `sgd_preds`.
# target_names is deliberately not passed: scikit-learn applies it positionally
# to the sorted labels present in y_test / sgd_preds, so an unsorted name list
# mislabels rows, and a split that lacks some class makes the lengths disagree.
# Without it, each row is titled with the actual label value.
print(classification_report(y_test, sgd_preds))

In [None]:
# Build the submission frame -- the ID of every test row next to its predicted
# label -- and write it to '<save_path>sgd1.csv' without the index column.
submission = pd.DataFrame({'ID': test_data['ID'], 'Label': test_pred})
submission.to_csv(save_path + 'sgd1.csv', index=False)