# Εργασία 2 (Τεχνικές Εξόρυξης Δεδομένων)
## Data Mining: Assignment 2
***
### Μαρία Φριτζελά 1115201400218
***

In [None]:
import glob
import csv
import re
import pandas as pd
import numpy as np
from itertools import chain
from IPython.core.display import display
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import  svm, metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

Κατηγοριοποίηση δεδομένων κειμένου από ειδησιογραφικά άρθρα 5 κατηγοριών:<br>
Classifying text data from articles of 5 different categories:

In [None]:
# The five news topics the articles are classified into
categories = [
    'business',
    'entertainment',
    'politics',
    'sport',
    'tech',
]

### Get names of files for testing and training
The data set consists of 2225 documents from a news website
corresponding to stories in five topical areas from 2004-2005.<br>
<br>
80% of data points (files) will be used for training, the remaining 20% will be used for testing.<br>
We will collect the file names as elements of two lists (one for each purpose).

In [None]:
# Root directory of the corpus; every category has its own sub-directory of .txt files
files_path = 'fulltext/data/'
train_files = []    # paths of the files that will be used for training
test_files = []     # paths of the files that will be used for testing

for category in categories:
    # all .txt paths of the current category, in alphabetical order
    files = sorted(glob.glob(files_path + category + '/*.txt'))
    # index that separates the leading 80% (training) from the rest (testing)
    sep_index = round(len(files) * 0.8)
    train_files += files[:sep_index]
    test_files += files[sep_index:]

Result:

In [None]:
# Report the size of the split. The total is computed from the two lists
# instead of being hard-coded, so the summary stays truthful if the corpus
# changes or the glob pattern matched fewer files than expected.
print("Total files: "+str(len(train_files) + len(test_files)))
print("# of train files: "+str(len(train_files)))
print("# of test files: "+str(len(test_files)))

### Create file train_set.tsv
Columns: id, title, content, category <br>
_The id is the name of the text file with the first letter of the category prepended (ex. "b001")_

In [None]:
# Export the training split as a tab-separated file with the columns
# id, title, content, category.
# newline='' hands line-ending control to the csv module; without it an extra
# '\r' is written after every row on platforms that use '\r\n' line endings.
with open('train_set.tsv', 'wt', newline='') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    # write header row
    tsv_writer.writerow(['id', 'title', 'content', 'category'])
    # write one row per article
    for file_path in train_files:
        with open(file_path) as f:
            # decoding happens while reading, so only the read is guarded
            try:
                raw_text = f.read()
            except UnicodeDecodeError:
                print("UnicodeDecodeError for file: "+file_path+". File skipped")
                continue
        # split into lines (dropping '\n') and discard blank ones
        content = [line for line in raw_text.splitlines() if line.strip()]
        if not content:
            # no line can serve as the title; skip the file instead of
            # aborting the whole export with an IndexError
            print("Empty file: "+file_path+". File skipped")
            continue
        # unpack the information we need from the file path
        # (expects exactly <dir>/<dir>/<category>/<name>.<extension>)
        _, _, cat, name, _ = file_path.replace('.', '/').split('/')
        # id = first letter of the category + file name,
        # title = first non-blank line, content = remaining lines joined by spaces
        tsv_writer.writerow([cat[0]+name,
                             content[0],
                             " ".join(content[1:]),
                             cat])


### Create file test_set.tsv
Columns: id, title, content

In [None]:
# Export the testing split as a tab-separated file with the columns
# id, title, content (the category is deliberately left out).
# newline='' hands line-ending control to the csv module; without it an extra
# '\r' is written after every row on platforms that use '\r\n' line endings.
with open('test_set.tsv', 'wt', newline='') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    # write header row
    tsv_writer.writerow(['id', 'title', 'content'])
    # write one row per article
    for file_path in test_files:
        with open(file_path) as f:
            # decoding happens while reading, so only the read is guarded
            try:
                raw_text = f.read()
            except UnicodeDecodeError:
                print("UnicodeDecodeError for file: "+file_path+". File skipped")
                continue
        # split into lines (dropping '\n') and discard blank ones
        content = [line for line in raw_text.splitlines() if line.strip()]
        if not content:
            # no line can serve as the title; skip the file instead of
            # aborting the whole export with an IndexError
            print("Empty file: "+file_path+". File skipped")
            continue
        # unpack the information we need from the file path
        # (expects exactly <dir>/<dir>/<category>/<name>.<extension>)
        _, _, cat, name, _ = file_path.replace('.', '/').split('/')
        # id = first letter of the category + file name,
        # title = first non-blank line, content = remaining lines joined by spaces
        tsv_writer.writerow([cat[0]+name,
                             content[0],
                             " ".join(content[1:])])

### Create a file with the full dataset: dataset.tsv


In [None]:
# Export every article (training files followed by testing files) as a
# tab-separated file with the columns id, title, content, category.
# newline='' hands line-ending control to the csv module; without it an extra
# '\r' is written after every row on platforms that use '\r\n' line endings.
with open('dataset.tsv', 'wt', newline='') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    # write header row
    tsv_writer.writerow(['id', 'title', 'content', 'category'])
    # write one row per article
    for file_path in chain(train_files, test_files):
        with open(file_path) as f:
            # decoding happens while reading, so only the read is guarded
            try:
                raw_text = f.read()
            except UnicodeDecodeError:
                print("UnicodeDecodeError for file: "+file_path+". File skipped")
                continue
        # split into lines (dropping '\n') and discard blank ones
        content = [line for line in raw_text.splitlines() if line.strip()]
        if not content:
            # no line can serve as the title; skip the file instead of
            # aborting the whole export with an IndexError
            print("Empty file: "+file_path+". File skipped")
            continue
        # unpack the information we need from the file path
        # (expects exactly <dir>/<dir>/<category>/<name>.<extension>)
        _, _, cat, name, _ = file_path.replace('.', '/').split('/')
        # id = first letter of the category + file name,
        # title = first non-blank line, content = remaining lines joined by spaces
        tsv_writer.writerow([cat[0]+name,
                             content[0],
                             " ".join(content[1:]),
                             cat])

Create a DataFrame for the data_set (id column as the index)

In [None]:
# Load the complete dataset into a DataFrame, with the 'id' column as the index.
# The individual splits can be loaded the same way when they are needed:
#   testdf = pd.read_csv("test_set.tsv", sep='\t', index_col='id')
#   traindf = pd.read_csv("train_set.tsv", sep='\t', index_col='id')
datadf = pd.read_csv(
    "dataset.tsv",
    index_col='id',
    sep='\t',
)
datadf

## 1 Δημιουργία WordCloud
**Create a WordCloud for the articles of each category**

In [None]:
# Create our own stopWord list:
stopwords = set(STOPWORDS)
stopwords.update(['say', 'said', 'saying', 'will', 'many', 'new', 'people', 'now', 'one'])

### Business

In [None]:
# Word cloud for the business articles.
# filter(like='b', axis=0) keeps the rows whose id (the index) contains 'b'
business_articles = datadf.filter(like='b', axis=0)[['title', 'content']]
# one long string: "<title> <content>" of every article, separated by spaces
business_text = " ".join(' '.join(pair) for pair in business_articles.values.tolist())
wordcloud_business = WordCloud(
    stopwords=stopwords,
    background_color='black',
    width=800,
    height=800,
).generate(business_text)

In [None]:
# Convert the business word cloud to an image object and show it in the cell output
business_image = wordcloud_business.to_image()
display(business_image)

### Entertainment

In [None]:
# Word cloud for the entertainment articles (rows whose id contains 'e').
# The text is joined explicitly, exactly like the business cloud. Passing
# str(<ndarray>) instead would feed the array's *printed form* to the cloud:
# brackets, quotes and escape sequences, and only a '...'-elided excerpt once
# the array is larger than NumPy's print threshold.
wordcloud_entertainment = WordCloud(
    width = 800,
    height = 800,
    background_color = 'black',
    stopwords = stopwords).generate(" ".join(title+' '+content
                                             for title, content in datadf.filter(like='e', axis=0)
                                                                   [['title', 'content']].values.tolist()))

In [None]:
#show it
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud_entertainment)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.show()

### Politics

In [None]:
# Word cloud for the politics articles (rows whose id contains 'p').
# The text is joined explicitly, exactly like the business cloud. Passing
# str(<ndarray>) instead would feed the array's *printed form* to the cloud:
# brackets, quotes and escape sequences, and only a '...'-elided excerpt once
# the array is larger than NumPy's print threshold.
wordcloud_politics = WordCloud(
    width = 800,
    height = 800,
    background_color = 'black',
    stopwords = stopwords).generate(" ".join(title+' '+content
                                             for title, content in datadf.filter(like='p', axis=0)
                                                                   [['title', 'content']].values.tolist()))

In [None]:
#show it
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud_politics)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.show()

### Sport

In [None]:
# Word cloud for the sport articles (rows whose id contains 's').
# The text is joined explicitly, exactly like the business cloud. Passing
# str(<ndarray>) instead would feed the array's *printed form* to the cloud:
# brackets, quotes and escape sequences, and only a '...'-elided excerpt once
# the array is larger than NumPy's print threshold.
wordcloud_sport = WordCloud(
    width = 800,
    height = 800,
    background_color = 'black',
    stopwords = stopwords).generate(" ".join(title+' '+content
                                             for title, content in datadf.filter(like='s', axis=0)
                                                                   [['title', 'content']].values.tolist()))

In [None]:
#show it
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud_sport)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.show()

### Tech

In [None]:
# Word cloud for the tech articles (rows whose id contains 't').
# The text is joined explicitly, exactly like the business cloud. Passing
# str(<ndarray>) instead would feed the array's *printed form* to the cloud:
# brackets, quotes and escape sequences, and only a '...'-elided excerpt once
# the array is larger than NumPy's print threshold.
wordcloud_tech = WordCloud(
    width = 800,
    height = 800,
    background_color = 'black',
    stopwords = stopwords).generate(" ".join(title+' '+content
                                             for title, content in datadf.filter(like='t', axis=0)
                                                                   [['title', 'content']].values.tolist()))

In [None]:
#show it
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud_tech)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.show()

## 2 Υλοποίηση Κατηγοριοποίησης (Classification)
**Data Classification**

### A) Cleaning and Pre-processing the data
Create a Pandas Series by concatenating the title and content columns of `datadf`,
and add it to `datadf` as a new column (`text`). <br>
Clean up text:<br>
- Join title and content with a space in between, so that words are not accidentally connected
- Make all the words lower case to facilitate clean up, using `.lower`
- Remove our list of stopwords
- Remove punctuation and special characters using `re.sub`
- Remove all words containing digits, and any digits using `re.sub`.

In [None]:
def _clean_text(row):
    """Return the cleaned, lower-cased text of one article row (title + content)."""
    # title and content joined by a single space
    joined = ' '.join(row.values.astype(str))
    # lower-case, split on whitespace and drop the words found in our stop-word set
    kept_words = [word for word in joined.lower().split() if word not in stopwords]
    # strip everything that is not a letter, a digit or a space
    alphanumeric = re.sub('[^A-Za-z0-9 ]+', '', ' '.join(kept_words))
    # finally remove digits together with any word that contains one
    return re.sub(r'\w*\d\w*', '', alphanumeric)


datadf['text'] = datadf[['title', 'content']].apply(_clean_text, axis=1)


#### Bag-of-words
Create bag-of-words vector

In [None]:
# Bag-of-words model: raw term counts over at most 3000 features,
# using the vectorizer's built-in English stop-word list
bow_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=3000,
)

# Learn the vocabulary from the cleaned text and build the document-term matrix
bow_X = bow_vectorizer.fit_transform(datadf['text'])


In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
bow_terms = (bow_vectorizer.get_feature_names_out()
             if hasattr(bow_vectorizer, 'get_feature_names_out')
             else bow_vectorizer.get_feature_names())
# Term counts of the first document, most frequent terms first.
# toarray() yields a plain ndarray column (one row per term).
pd.DataFrame(bow_X[0:1].T.toarray(), index=bow_terms, columns=["counts"])\
.sort_values(by=["counts"],ascending=False)

#### TF-IDF

In [None]:
# TF-IDF model over unigrams and bigrams, limited to 3000 features,
# using the vectorizer's built-in English stop-word list
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=3000,
    ngram_range=(1, 2),
)

# Learn the vocabulary/weights from the cleaned text and build the document-term matrix
tfidf_X = tfidf_vectorizer.fit_transform(datadf['text'])

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
tfidf_terms = (tfidf_vectorizer.get_feature_names_out()
               if hasattr(tfidf_vectorizer, 'get_feature_names_out')
               else tfidf_vectorizer.get_feature_names())
# TF-IDF weights of the first document, highest weights first.
# toarray() yields a plain ndarray column (one row per term).
pd.DataFrame(tfidf_X[0:1].T.toarray(), index=tfidf_terms, columns=["tfidf"])\
.sort_values(by=["tfidf"],ascending=False)

Separate data into train (80%) and test (20%) set <br>
Use the stratify parameter so that every category keeps the same proportion in the train and test sets

In [None]:
# Split once and apply the same partition to both feature matrices, so that the
# BoW and TF/IDF models are trained and evaluated on exactly the same documents
# (two independent calls shuffle differently, which makes their scores incomparable).
# stratify keeps the category proportions in both parts; random_state makes the
# split reproducible from run to run.
(X_train_bow, X_test_bow,
 X_train_tfidf, X_test_tfidf,
 y_train_bow, y_test_bow) = train_test_split(bow_X, tfidf_X, datadf.category,
                                             test_size=0.2, stratify=datadf.category,
                                             random_state=42)

# both representations describe the same documents, so they share the labels
y_train_tfidf, y_test_tfidf = y_train_bow, y_test_bow

### B) Classification

#### Support Vector Machines (SVM)

In [None]:
#instantiate the model
svm_clf = svm.SVC()

# train the model on the BoW training set
svm_clf.fit(X_train_bow, y_train_bow)
# predict the BoW test set
y_pred_svm_bow = svm_clf.predict(X_test_bow)

# train the model on the TF/IDF training set (previous weights and variables are reset)
svm_clf.fit(X_train_tfidf, y_train_tfidf)
# predict the TF/IDF test set
y_pred_svm_tfidf = svm_clf.predict(X_test_tfidf)

#### Random Forest

In [None]:
# Instantiate the model
rf = RandomForestClassifier()

# Train the model on the BoW training set
rf.fit(X_train_bow, y_train_bow)
# predict the BoW test set
y_pred_rf_bow = rf.predict(X_test_bow)

# train the model on the TF/IDF training set (previous weights and variables are reset)
rf.fit(X_train_tfidf, y_train_tfidf)
# predict the TF/IDF test set
y_pred_rf_tfidf = rf.predict(X_test_tfidf)

#### Naive Bayes

In [None]:
# Instantiate the model
nb = GaussianNB()

# Train the model on the BoW training set
nb.fit(X_train_bow.toarray(), y_train_bow)
# predict the BoW test set
y_pred_nb_bow = nb.predict(X_test_bow.toarray())

# Train the model on the TF/IDF training set (previous weights and variables are reset)
nb.fit(X_train_tfidf.toarray(), y_train_tfidf)
# predict the TF/IDF test set
y_pred_nb_tfidf = nb.predict(X_test_tfidf.toarray())

#### Precision Scores

In [None]:
# One entry per experiment: (model label, feature label, true labels, predicted labels)
evaluated_runs = [
    ("SVM", "BoW", y_test_bow, y_pred_svm_bow),
    ("SVM", "TF/IDF", y_test_tfidf, y_pred_svm_tfidf),
    ("RF", "BoW", y_test_bow, y_pred_rf_bow),
    ("RF", "TF/IDF", y_test_tfidf, y_pred_rf_tfidf),
    ("NB", "BoW", y_test_bow, y_pred_nb_bow),
    ("NB", "TF/IDF", y_test_tfidf, y_pred_nb_tfidf),
]

# Per-category scores (average=None returns one value per class):
#   precision - of the articles assigned to a category, the share that belong to it
#   recall    - of the articles belonging to a category, the share that were found
#   F-measure - harmonic mean of precision and recall
per_class_scorers = [
    ("Precision", metrics.precision_score),
    ("Recall", metrics.recall_score),
    ("F-Measure", metrics.f1_score),
]
for score_name, scorer in per_class_scorers:
    for model_name, features, y_true, y_pred in evaluated_runs:
        print(score_name + " " + model_name + " for " + features + ":",
              scorer(y_true, y_pred, average=None))

print()
# Accuracy: compare actual response values (y_test) with predicted response values (y_pred)
for model_name, features, y_true, y_pred in evaluated_runs:
    print("Accuracy " + model_name + " for " + features + ":",
          metrics.accuracy_score(y_true, y_pred))

In [None]:
# Mean of the SVM's 10-fold cross-validation scores on the TF/IDF training set
# (no `scoring` given, so the estimator's default score is used)
cv_scores = cross_val_score(svm_clf, X_train_tfidf, y_train_tfidf, cv=10)
print(cv_scores.mean())

In [None]:
# Mean macro-averaged precision of the SVM over 10-fold cross-validation
# on the TF/IDF training set
cross_val_score(svm_clf, X_train_tfidf, y_train_tfidf,
                scoring='precision_macro', cv=10).mean()

#### K-Nearest Neighbor