In [2]:
import numpy as np
import pprint
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.metrics import accuracy_score, f1_score, make_scorer, precision_score, recall_score
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
import advertools as adv
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score,  cross_val_predict
from sklearn.svm import LinearSVC
import seaborn as sns
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler, SMOTE
import langdetect
pp = pprint.PrettyPrinter(indent=4, sort_dicts=False)

#### Loading the data into dataframes and doing light preprocessing 

In [3]:
def preprocess(data):
    data.text = data.text.apply(lambda x: x.lower().rstrip())
    data.text.replace(r'\s+|\\n', ' ', regex=True, inplace = True)
    # data.text.replace(r'^\s*$', np.nan,regex=True, inplace=True)
    # data.dropna(inplace=True)
    # data.reset_index(drop=True, inplace=True)
    return data


train_data = pd.read_csv("data/train_data.csv")
train_data = preprocess(train_data)
print(train_data.head())


test_data = pd.read_csv("data/test_data.csv")
test_data = preprocess(test_data)
print(test_data.head())



  language                                               text    label
0    dansk   dette er et fremragende initiativ, og jeg stø...  Ireland
1    dansk   hr. formand, jeg er sikker på, at alle her er...  Ireland
2    dansk   hr. formand, folk på den nordlige halvkugle t...  England
3    dansk   hr. formand, med forbehold af nogle få ændrin...  England
4    dansk   - hr. formand, jeg må protestere mod den lemf...  England
                                                text
0   hr. formand, selv om vi i høj grad sympatiser...
1   quiero dejar constancia de mi apoyo a este in...
2   . – el comercio ilegal de riñones humanos se ...
3   signor presidente, per introdurre una nota di...
4   jeg stemte for meddelelsen af decharge til fæ...


In [4]:
train_data = train_data[pd.notnull(train_data["text"])]
train_data["category_id"] = train_data["label"].astype("category")
train_data["category_id"] = train_data["category_id"].cat.codes
train_data["lang_id"] = train_data["language"].astype("category")
train_data["lang_id"] = train_data["lang_id"].cat.codes
train_data.head()

lang_id_df = train_data[['language', 'lang_id']].drop_duplicates().sort_values('lang_id')
lang_to_id = dict(lang_id_df.values)
id_to_lang = dict(lang_id_df[['lang_id', 'language']].values)

category_id_df = train_data[['label', 'category_id']].drop_duplicates().sort_values('category_id')
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'label']].values)


In [6]:
detect_langs_dict

{'de': 0, 'nl': 1, 'da': 2, 'es': 3, 'it': 4}

In [73]:
detect_langs_list = ["de", "nl", "da", "es", "it"]
detect_langs_dict = {key: value for key, value in zip(detect_langs_list, id_to_lang.keys())}
langdetect.DetectorFactory.seed = 21

def language_detector(row):
    try:
        langs = langdetect.detect_langs(row["text"])
        for lang in langs:
            lang = lang.lang
            if lang in detect_langs_list:
                return detect_langs_dict[lang]
            else:
                print(row)
                return 2
    except:
        print(row)
        return 0

test_data["lang_id"] = test_data.apply(lambda row: language_detector(row), axis=1)
test_data.loc[~test_data["lang_id"].isin([0,1,2,3,4])]


text     jeg sagde managua, jeg sagde ikke projektstedet.
Name: 6666, dtype: object
text     dat was zo' n uitgebreid antwoord dat ik geen...
Name: 7210, dtype: object
text    
Name: 7582, dtype: object
text     hr. formand, til forretningsordenen. må jeg o...
Name: 10684, dtype: object
text     jeg opponerer imod kinas etbarnspolitik og an...
Name: 13164, dtype: object
text     fru formand, en bemærkning til forretningsord...
Name: 13419, dtype: object


Unnamed: 0,text,lang_id


In [74]:
import advertools as adv
from sklearn.feature_extraction.text import TfidfVectorizer

stop_words = []
for key in ["danish", "german", "dutch", "italian", "spanish"]:
    stop_words += list(adv.stopwords[key])


tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words=stop_words)


In [75]:
from collections import defaultdict

lang_bows = defaultdict(int)
models = {i: i  for i in range(5)}

for language in id_to_lang.keys():
    lang_bows[language] = tfidf.fit_transform(train_data[train_data.lang_id == language].text)
    models[language] = LinearSVC(max_iter=5000, random_state=21, C=1,  penalty="l1", dual=False, class_weight="balanced").fit(lang_bows[language], train_data[train_data.lang_id == language].category_id)
    test_data.loc[test_data.lang_id == language, "text"] = test_data.loc[test_data.lang_id == language, "text"].apply(lambda x: tfidf.transform([x]).toarray())


In [76]:
def MultiLangPredict(row):
    return models[row["lang_id"]].predict(row["text"])

predicted_labels = []
for index, row in test_data.iterrows():
    predicted_labels.append(MultiLangPredict(row))
predicted_labels = [predicted_labels[i][0] for i in range(len(predicted_labels))]


In [77]:
# # Random Undersampling
# us = RandomUnderSampler(random_state=21)
# features, labels = us.fit_resample(features, labels)

# # Random Oversampling
# os = RandomOverSampler(random_state=21)
# features, labels= os.fit_resample(features, labels)

# # SMOTE
# os = SMOTE(random_state=21)
# features, labels= os.fit_resample(features, labels)

In [78]:
final_data = {"id": test_data.index+1, "label": predicted_labels}

submission = pd.DataFrame(data=final_data).set_index("id")
submission = submission.label.apply(lambda x: id_to_category[x])
submission.to_csv("submissions/submission_SVM_lang_detection.csv")
submission.head()

id
1     England
2     Ireland
3     England
4     England
5    Scotland
Name: label, dtype: object