# Zadanie 7

## Ładowanie bibliotek oraz ładowanie danych wraz z ich przygotowaniem

In [58]:
import re
import numpy as np
import pandas as pd
import string
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier


# Characters removed from every line: tabs, line breaks, digits, and anything
# that is not a word character, a plus sign or whitespace.
_NOISE_PATTERN = re.compile(r'[\t]+|[\n]+|[\r]+|[0-9]+|[^\w+\s]')


def _clean_line(line):
    """Return *line* lower-cased with tabs, line breaks, digits and punctuation removed."""
    return _NOISE_PATTERN.sub('', line.lower())


def load_text_data_and_clean(files=None):
    """Load the raw per-language text files and return them as a cleaned DataFrame.

    Each file is read line by line with ISO-8859-1 decoding. Every line is
    lower-cased and stripped of tabs, line breaks, digits and punctuation.
    Lines that contain nothing but whitespace after cleaning are skipped, so
    no empty sample is labelled with a language.

    Args:
        files: Iterable of file paths to read. The language label of a file is
            its base name without extension. Defaults to the six corpus files
            ('english.txt', 'german.txt', ...) in the working directory.

    Returns:
        pandas.DataFrame with the columns 'TEXT' (cleaned line) and 'LANGUAGE'
        (label derived from the file name), one row per retained line.

    A file that cannot be opened or read is reported on stdout and skipped;
    the remaining files are still loaded.
    """
    import os

    if files is None:
        files = ['english.txt', 'german.txt', 'spanish.txt', 'polish.txt', 'finnish.txt', 'italian.txt']

    texts = []
    languages = []
    for filename in files:
        # str.strip(".txt") removes any of the characters '.', 't', 'x' from
        # both ends (e.g. "text.txt" -> "e"); drop the extension explicitly.
        lang = os.path.splitext(os.path.basename(filename))[0]
        try:
            with open(filename, encoding='iso-8859-1', mode='r', newline='\n') as f:
                for line in f:
                    cleaned = _clean_line(line)
                    # Covers "\n", "\r\n", whitespace-only and digit-only lines.
                    if not cleaned.strip():
                        continue
                    texts.append(cleaned)
                    languages.append(lang)
        except FileNotFoundError:
            print(f"Plik {filename} nie został odnaleziony")
        except OSError as err:
            print(f"Nie można odczytać pliku {filename}: {err}")
    return pd.DataFrame({'TEXT': texts, 'LANGUAGE': languages})

## Podział danych na zbiór treningowy i testowy (udział zbioru testowego: $0.2$, $0.3$, $0.4$)

In [59]:
# Load the cleaned corpus once; every split below is drawn from this frame.
data = load_text_data_and_clean()

# Three shuffled train/test splits of the same data with the same seed,
# holding out 20%, 30% and 40% of the rows respectively.
(train_02, test_02), (train_03, test_03), (train_04, test_04) = (
    train_test_split(data, test_size=held_out, shuffle=True, random_state=123)
    for held_out in (0.2, 0.3, 0.4)
)

## Reprezentujemy tekst przy użyciu n-gramów, wykonujemy transformacje i dopasowujemy MNB

In [60]:
def _make_mnb_pipeline(n):
    """Build an unfitted n-gram counts -> TF-IDF -> Multinomial Naive Bayes pipeline.

    Args:
        n: n-gram size; the vectorizer uses ngram_range=(n, n), so only
            n-grams of exactly this size are counted.
    """
    return Pipeline([
        ('vect', CountVectorizer(ngram_range=(n, n))),
        # NOTE(review): the original reviewer asked whether TF-IDF weighting
        # is intended in front of MultinomialNB - confirm.
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ])


def _fit_mnb_models(train, *models):
    """Fit every pipeline in *models* on the TEXT/LANGUAGE columns of *train*."""
    for model in models:
        model.fit(train['TEXT'], train['LANGUAGE'])


# One independent pipeline per (test fraction, n-gram size) combination.
clf_MNB_02_unigram, clf_MNB_02_bigram, clf_MNB_02_trigram = (_make_mnb_pipeline(n) for n in (1, 2, 3))
clf_MNB_03_unigram, clf_MNB_03_bigram, clf_MNB_03_trigram = (_make_mnb_pipeline(n) for n in (1, 2, 3))
clf_MNB_04_unigram, clf_MNB_04_bigram, clf_MNB_04_trigram = (_make_mnb_pipeline(n) for n in (1, 2, 3))

# Each model is trained on the training part of its own split.
_fit_mnb_models(train_02, clf_MNB_02_unigram, clf_MNB_02_bigram, clf_MNB_02_trigram)
_fit_mnb_models(train_03, clf_MNB_03_unigram, clf_MNB_03_bigram, clf_MNB_03_trigram)
_fit_mnb_models(train_04, clf_MNB_04_unigram, clf_MNB_04_bigram, clf_MNB_04_trigram)

Pipeline(steps=[('vect', CountVectorizer(ngram_range=(3, 3))),
                ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])

## Predykcje dla MNB

In [61]:
# Predict the language of every held-out text with the MNB model that was
# trained on the matching split (20%, 30% and 40% test fraction).
predicted_clf_MNB_02_unigram, predicted_clf_MNB_02_bigram, predicted_clf_MNB_02_trigram = (
    model.predict(test_02['TEXT'])
    for model in (clf_MNB_02_unigram, clf_MNB_02_bigram, clf_MNB_02_trigram)
)
predicted_clf_MNB_03_unigram, predicted_clf_MNB_03_bigram, predicted_clf_MNB_03_trigram = (
    model.predict(test_03['TEXT'])
    for model in (clf_MNB_03_unigram, clf_MNB_03_bigram, clf_MNB_03_trigram)
)
predicted_clf_MNB_04_unigram, predicted_clf_MNB_04_bigram, predicted_clf_MNB_04_trigram = (
    model.predict(test_04['TEXT'])
    for model in (clf_MNB_04_unigram, clf_MNB_04_bigram, clf_MNB_04_trigram)
)

## Precision, recall, f1 i accuracy dla MNB

In [62]:
# Per-language precision, recall and F1 plus overall accuracy: unigram MNB, 20% test split.
print(metrics.classification_report(y_true=test_02['LANGUAGE'], y_pred=predicted_clf_MNB_02_unigram))

              precision    recall  f1-score   support

     english       1.00      0.99      1.00      6667
     finnish       1.00      1.00      1.00      6255
      german       0.95      1.00      0.98     13710
     italian       1.00      0.83      0.91      2667
      polish       1.00      0.19      0.32       246
     spanish       0.99      1.00      1.00      6252

    accuracy                           0.98     35797
   macro avg       0.99      0.84      0.87     35797
weighted avg       0.98      0.98      0.98     35797



In [1]:
# Classification report: bigram MNB, 20% test split.
print(metrics.classification_report(y_true=test_02['LANGUAGE'], y_pred=predicted_clf_MNB_02_bigram))

In [2]:
# Classification report: trigram MNB, 20% test split.
print(metrics.classification_report(y_true=test_02['LANGUAGE'], y_pred=predicted_clf_MNB_02_trigram))

In [3]:
# Classification report: unigram MNB, 30% test split.
print(metrics.classification_report(y_true=test_03['LANGUAGE'], y_pred=predicted_clf_MNB_03_unigram))

In [4]:
# Classification report: bigram MNB, 30% test split.
print(metrics.classification_report(y_true=test_03['LANGUAGE'], y_pred=predicted_clf_MNB_03_bigram))

In [5]:
# Classification report: trigram MNB, 30% test split.
print(metrics.classification_report(y_true=test_03['LANGUAGE'], y_pred=predicted_clf_MNB_03_trigram))

In [6]:
# Classification report: unigram MNB, 40% test split.
print(metrics.classification_report(y_true=test_04['LANGUAGE'], y_pred=predicted_clf_MNB_04_unigram))

In [7]:
# Classification report: bigram MNB, 40% test split.
print(metrics.classification_report(y_true=test_04['LANGUAGE'], y_pred=predicted_clf_MNB_04_bigram))

In [8]:
# Classification report: trigram MNB, 40% test split.
print(metrics.classification_report(y_true=test_04['LANGUAGE'], y_pred=predicted_clf_MNB_04_trigram))

## Reprezentujemy tekst przy użyciu n-gramów, wykonujemy transformacje i dopasowujemy regresję logistyczną

In [9]:
def _make_lr_pipeline(n):
    """Build an unfitted n-gram counts -> TF-IDF -> logistic regression pipeline.

    Args:
        n: n-gram size; the vectorizer uses ngram_range=(n, n), so only
            n-grams of exactly this size are counted.
    """
    return Pipeline([
        ('vect', CountVectorizer(ngram_range=(n, n))),
        ('tfidf', TfidfTransformer()),
        ('clf', LogisticRegression(solver='newton-cg', max_iter=100)),
    ])


def _fit_lr_models(train, *models):
    """Fit every pipeline in *models* on the TEXT/LANGUAGE columns of *train*."""
    for model in models:
        model.fit(train['TEXT'], train['LANGUAGE'])


# One independent pipeline per (test fraction, n-gram size) combination.
clf_LR_02_unigram, clf_LR_02_bigram, clf_LR_02_trigram = (_make_lr_pipeline(n) for n in (1, 2, 3))
clf_LR_03_unigram, clf_LR_03_bigram, clf_LR_03_trigram = (_make_lr_pipeline(n) for n in (1, 2, 3))
clf_LR_04_unigram, clf_LR_04_bigram, clf_LR_04_trigram = (_make_lr_pipeline(n) for n in (1, 2, 3))

# Each model is trained on the training part of its own split.
_fit_lr_models(train_02, clf_LR_02_unigram, clf_LR_02_bigram, clf_LR_02_trigram)
_fit_lr_models(train_03, clf_LR_03_unigram, clf_LR_03_bigram, clf_LR_03_trigram)
_fit_lr_models(train_04, clf_LR_04_unigram, clf_LR_04_bigram, clf_LR_04_trigram)

## Predykcje dla regresji logistycznej

In [None]:
# Predict the language of every held-out text with the logistic-regression
# model that was trained on the matching split (20%, 30% and 40% test fraction).
predicted_clf_LR_02_unigram, predicted_clf_LR_02_bigram, predicted_clf_LR_02_trigram = (
    model.predict(test_02['TEXT'])
    for model in (clf_LR_02_unigram, clf_LR_02_bigram, clf_LR_02_trigram)
)
predicted_clf_LR_03_unigram, predicted_clf_LR_03_bigram, predicted_clf_LR_03_trigram = (
    model.predict(test_03['TEXT'])
    for model in (clf_LR_03_unigram, clf_LR_03_bigram, clf_LR_03_trigram)
)
predicted_clf_LR_04_unigram, predicted_clf_LR_04_bigram, predicted_clf_LR_04_trigram = (
    model.predict(test_04['TEXT'])
    for model in (clf_LR_04_unigram, clf_LR_04_bigram, clf_LR_04_trigram)
)

## Precision, recall, f1 i accuracy dla regresji logistycznej

In [None]:
# Classification report: unigram logistic regression, 20% test split.
print(metrics.classification_report(y_true=test_02['LANGUAGE'], y_pred=predicted_clf_LR_02_unigram))

In [None]:
# Classification report: bigram logistic regression, 20% test split.
print(metrics.classification_report(y_true=test_02['LANGUAGE'], y_pred=predicted_clf_LR_02_bigram))

In [None]:
# Classification report: trigram logistic regression, 20% test split.
print(metrics.classification_report(y_true=test_02['LANGUAGE'], y_pred=predicted_clf_LR_02_trigram))

In [None]:
# Classification report: unigram logistic regression, 30% test split.
print(metrics.classification_report(y_true=test_03['LANGUAGE'], y_pred=predicted_clf_LR_03_unigram))

In [None]:
# Classification report: bigram logistic regression, 30% test split.
print(metrics.classification_report(y_true=test_03['LANGUAGE'], y_pred=predicted_clf_LR_03_bigram))

In [None]:
# Classification report: trigram logistic regression, 30% test split.
print(metrics.classification_report(y_true=test_03['LANGUAGE'], y_pred=predicted_clf_LR_03_trigram))

In [None]:
# Classification report: unigram logistic regression, 40% test split.
print(metrics.classification_report(y_true=test_04['LANGUAGE'], y_pred=predicted_clf_LR_04_unigram))

In [None]:
# Classification report: bigram logistic regression, 40% test split.
print(metrics.classification_report(y_true=test_04['LANGUAGE'], y_pred=predicted_clf_LR_04_bigram))

In [None]:
# Classification report: trigram logistic regression, 40% test split.
print(metrics.classification_report(y_true=test_04['LANGUAGE'], y_pred=predicted_clf_LR_04_trigram))

## Reprezentujemy tekst przy użyciu n-gramów, wykonujemy transformacje i dopasowujemy SVM

In [None]:
def _make_svm_pipeline(n):
    """Build an unfitted n-gram counts -> TF-IDF -> hinge-loss SGD classifier pipeline.

    Args:
        n: n-gram size; the vectorizer uses ngram_range=(n, n), so only
            n-grams of exactly this size are counted.
    """
    return Pipeline([
        ('vect', CountVectorizer(ngram_range=(n, n))),
        ('tfidf', TfidfTransformer()),
        # tol=None disables the tolerance-based stopping criterion, so
        # training always runs for the full max_iter=5 epochs.
        ('clf', SGDClassifier(loss='hinge', penalty='l2',
                              alpha=1e-3, random_state=42,
                              max_iter=5, tol=None)),
    ])


def _fit_svm_models(train, *models):
    """Fit every pipeline in *models* on the TEXT/LANGUAGE columns of *train*."""
    for model in models:
        model.fit(train['TEXT'], train['LANGUAGE'])


# One independent pipeline per (test fraction, n-gram size) combination.
clf_SVM_02_unigram, clf_SVM_02_bigram, clf_SVM_02_trigram = (_make_svm_pipeline(n) for n in (1, 2, 3))
clf_SVM_03_unigram, clf_SVM_03_bigram, clf_SVM_03_trigram = (_make_svm_pipeline(n) for n in (1, 2, 3))
clf_SVM_04_unigram, clf_SVM_04_bigram, clf_SVM_04_trigram = (_make_svm_pipeline(n) for n in (1, 2, 3))

# Each model is trained on the training part of its own split.
_fit_svm_models(train_02, clf_SVM_02_unigram, clf_SVM_02_bigram, clf_SVM_02_trigram)
_fit_svm_models(train_03, clf_SVM_03_unigram, clf_SVM_03_bigram, clf_SVM_03_trigram)
_fit_svm_models(train_04, clf_SVM_04_unigram, clf_SVM_04_bigram, clf_SVM_04_trigram)

## Predykcje dla SVM

In [None]:
# Predict the language of every held-out text with the SVM model that was
# trained on the matching split (20%, 30% and 40% test fraction).
predicted_clf_SVM_02_unigram, predicted_clf_SVM_02_bigram, predicted_clf_SVM_02_trigram = (
    model.predict(test_02['TEXT'])
    for model in (clf_SVM_02_unigram, clf_SVM_02_bigram, clf_SVM_02_trigram)
)
predicted_clf_SVM_03_unigram, predicted_clf_SVM_03_bigram, predicted_clf_SVM_03_trigram = (
    model.predict(test_03['TEXT'])
    for model in (clf_SVM_03_unigram, clf_SVM_03_bigram, clf_SVM_03_trigram)
)
predicted_clf_SVM_04_unigram, predicted_clf_SVM_04_bigram, predicted_clf_SVM_04_trigram = (
    model.predict(test_04['TEXT'])
    for model in (clf_SVM_04_unigram, clf_SVM_04_bigram, clf_SVM_04_trigram)
)

## Precision, recall, f1 i accuracy dla SVM

In [None]:
# Classification report: unigram SVM, 20% test split.
print(metrics.classification_report(y_true=test_02['LANGUAGE'], y_pred=predicted_clf_SVM_02_unigram))

In [None]:
# Classification report: bigram SVM, 20% test split.
print(metrics.classification_report(y_true=test_02['LANGUAGE'], y_pred=predicted_clf_SVM_02_bigram))

In [None]:
# Classification report: trigram SVM, 20% test split.
print(metrics.classification_report(y_true=test_02['LANGUAGE'], y_pred=predicted_clf_SVM_02_trigram))

In [None]:
# Classification report: unigram SVM, 30% test split.
print(metrics.classification_report(y_true=test_03['LANGUAGE'], y_pred=predicted_clf_SVM_03_unigram))

In [None]:
# Classification report: bigram SVM, 30% test split.
print(metrics.classification_report(y_true=test_03['LANGUAGE'], y_pred=predicted_clf_SVM_03_bigram))

In [None]:
# Classification report: trigram SVM, 30% test split.
print(metrics.classification_report(y_true=test_03['LANGUAGE'], y_pred=predicted_clf_SVM_03_trigram))

In [None]:
# Classification report: unigram SVM, 40% test split.
print(metrics.classification_report(y_true=test_04['LANGUAGE'], y_pred=predicted_clf_SVM_04_unigram))

In [None]:
# Classification report: bigram SVM, 40% test split.
print(metrics.classification_report(y_true=test_04['LANGUAGE'], y_pred=predicted_clf_SVM_04_bigram))

In [None]:
# Classification report: trigram SVM, 40% test split.
print(metrics.classification_report(y_true=test_04['LANGUAGE'], y_pred=predicted_clf_SVM_04_trigram))