# Mount Google Drive

In [1]:
from google.colab import drive
# Mount Google Drive inside the Colab VM at this mount point.
drive_path = '/content/drive'
drive.mount(drive_path)

# Folder (relative to the "My Drive" root) and file name of the dataset.
path_to_file = '/Colab Notebooks/RUPP Tutorial/IDM/Assignment/'
src_file = 'sample_dataset.csv'

# Full path of the dataset on the mounted drive.
src_filepath = f"{drive_path}/My Drive{path_to_file}{src_file}"

Mounted at /content/drive


In [2]:
# # Import pandas library
import pandas as pd
import numpy as np
# import re

# Load Data into DataFrame

In [3]:
# Read the tab-separated dataset, keeping only the headline text and its label.
# NOTE(review): quotechar=" " makes a single space the quote character. This is
# presumably a workaround so that double quotes inside headlines are not parsed
# as field quoting -- confirm; `quoting=csv.QUOTE_NONE` would state that intent
# explicitly.
read_options = {
    "encoding": "utf8",
    "sep": '\t',
    "quotechar": " ",
    "engine": 'python',
    "usecols": ["TITLE", "CATEGORY"],
}
dataframe = pd.read_csv(src_filepath, **read_options)

In [4]:
# Show the column labels of the loaded frame (only the columns requested via `usecols`).
dataframe.columns

Index(['TITLE', 'CATEGORY'], dtype='object')

In [5]:
# Display the loaded DataFrame for a quick visual sanity check.
dataframe

Unnamed: 0,TITLE,CATEGORY
0,"Fed official says weak data caused by weather,...",b
1,Fed's Charles Plosser sees high bar for change...,b
2,US open: Stocks fall after Fed official hints ...,b
3,"Fed risks falling 'behind the curve', Charles ...",b
4,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,b
5,Plosser: Fed May Have to Accelerate Tapering Pace,b
6,Fed's Plosser: Taper pace may be too slow,b
7,Fed's Plosser expects US unemployment to fall ...,b
8,US jobs growth last month hit by weather:Fed P...,b
9,ECB unlikely to end sterilisation of SMP purch...,b


# Data Preprocessing

In [6]:
# Preprocessing
# Count the missing values per column once, then report them if there are any.
null_counts = dataframe.isnull().sum()
if null_counts.any():
    print('Missing Data\n')
    print(null_counts)
else:
    print('NO missing data')

NO missing data


In [7]:
# Check for duplicate rows; when found, keep the first occurrence of each
# and renumber the index so it stays contiguous.
duplicate_mask = dataframe.duplicated()
if not duplicate_mask.any():
    print('NO duplicate data')
else:
    print('Duplicate rows found')
    print('Number of duplicate rows= ', int(duplicate_mask.sum()))
    dataframe.drop_duplicates(inplace=True, keep='first')
    dataframe.reset_index(inplace=True, drop=True)
    print('Dropping duplicates\n')
    print(dataframe.shape)

NO duplicate data


In [8]:
# Download the NLTK resources that the cleaning step relies on:
# the stop-word lists, the Punkt tokenizer models and the WordNet corpus.
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string
from sklearn import set_config
set_config(transform_output="pandas")

wnl = WordNetLemmatizer()

# Load the English stop-word list once. Calling stopwords.words('english') for
# every token re-reads the corpus each time and tests membership against a list;
# a frozenset built up front gives the same result with O(1) lookups.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Function for cleaning and tokenizing a headline
def tokenize(doc):
  """Clean a headline and split it into lemmatized, stop-word-free tokens.

  Steps, in order: lowercase the text, remove all digits, remove ASCII
  punctuation, strip leading/trailing whitespace, tokenize with NLTK's
  ``word_tokenize``, drop tokens found in the English stop-word list and
  lemmatize what remains with ``WordNetLemmatizer``.

  Args:
    doc: The raw headline string.

  Returns:
    A list of lemmatized tokens.
  """
  document = doc.lower()  # convert the headline to lowercase
  document = re.sub(r'\d+', '', document)  # remove all digits (regular expression)
  document = document.translate(_PUNCTUATION_TABLE)  # remove punctuation (, . ! # ...)
  document = document.strip()  # remove whitespace at the start and end of the headline
  # Tokenize, keep only tokens that are not English stop words (very common
  # words that give no benefit to the classifier), then lemmatize them.
  return [
      wnl.lemmatize(token)
      for token in word_tokenize(document)
      if token not in _ENGLISH_STOPWORDS
  ]

# The preprocessing pipeline: bag-of-words counts followed by TF-IDF weighting.
preprocessor = Pipeline([
    # Use the custom tokenizer. token_pattern=None states explicitly that the
    # default regex is unused, which avoids scikit-learn's
    # "token_pattern will not be used since tokenizer is not None" warning.
    ('vect', CountVectorizer(tokenizer=tokenize, token_pattern=None)),
    ('tfidf', TfidfTransformer()),
])

# Fit the pipeline on every headline and produce the TF-IDF feature matrix.
# NOTE(review): the pipeline is fitted on the full dataset before the
# train/test split, so the vocabulary and IDF weights also see the rows that
# later become the test set. Consider fitting on the training rows only.
tfidf_dataset = preprocessor.fit_transform(dataframe["TITLE"].values)



In [10]:
# _test = pd.DataFrame(tfidf_dataset.toarray())

In [11]:
# _test

In [13]:
# Save dataset with extracted feature
# save_path = drive_path + '/My Drive' + path_to_file + "dataset_feature.csv"
# _test.to_csv(save_path)

# Training Model

## Label encoder

In [14]:
# NOTE(review): `Y` is not used in any visible cell; this looks like an
# accidental IDE auto-import -- confirm and remove.
from tkinter.constants import Y
from sklearn.preprocessing import LabelEncoder

# Learn the category -> integer mapping first, then encode the label column.
le = LabelEncoder().fit(dataframe["CATEGORY"])
class_label = le.transform(dataframe["CATEGORY"])
# list(le.classes_)
class_label

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [15]:
from sklearn.model_selection import train_test_split

# Split the features and labels into training and testing sets.
# random_state pins the shuffle so the split, and therefore every metric
# reported below, is reproducible across runs (same seed as the classifier).
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_dataset.toarray(),
    class_label,
    test_size=0.3,  # fraction of the rows held out for testing (between 0 and 1)
    random_state=42,
)

In [16]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Decision Tree: Gini impurity, best split at each node, fixed seed.
# fit() returns the estimator itself, so construction and training are chained.
DTClass = DecisionTreeClassifier(
    criterion="gini",
    splitter="best",
    random_state=42,
).fit(X_train, y_train)
y_pred = DTClass.predict(X_test)

# Accuracy on the held-out test rows.
dt_accuracy = accuracy_score(y_test, y_pred)
print("accuracy score of Decision Tree:")
print(dt_accuracy)

accuracy score of Decision Tree:
1.0


In [17]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1-score and support on the test set.
report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3

