## A simple model based on logistic regression for determining if the comments are useful or not.

Import libraries and parse the CSV data --> keep the comment type, the comment string and its label (the `non-information` column), and attach the code column loaded from the separate code file.

In [106]:
from helpers.data_loader import DataLoader
from helpers.data_preprocessing import DataProcesser
from helpers.feature_helper import FeatureHelper
from text_representation.text_representation_factory import TextRepresentationFactory
from helpers.text_similarity import TextSimilarity

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, normalize, scale
from scipy import sparse

# Load the labelled comments and add the 'code' column read from the second CSV file.
data = DataLoader.load_csv_file("./../data/train_set_0520.csv", ['type', 'comment', 'non-information'])
data['code'] = DataLoader.load_csv_file("./../data/code_data.csv", ['code'])['code']

# Coerce both text columns to str; the code column is additionally run through
# the project's code preprocessing.
data['comment'] = data['comment'].apply(str)
data['code'] = data['code'].apply(str).apply(DataProcesser.preprocess_code)
code = data['code']

# Binary target: 1 where the 'non-information' label equals 'yes', 0 otherwise.
values = np.where(data['non-information'].values == 'yes', 1, 0)

Preprocess data:
1. remove all special characters
2. TODO: expand contractions (don't = do not etc.)
3. remove special characters
4. stemming --> strips the word's affixes to reduce it to its stem (e.g. "database" becomes "databas")
5. lemmatisation --> maps the word to its dictionary base form (its lemma)

Observations: Removing stopwords decreased accuracy

In [107]:
# Compute the Java-tag ratio from the comment text as it is *before* the
# preprocessing step below replaces it.
# NOTE: this cell overwrites data['comment'] in place, so re-running it without
# re-running the loading cell first computes the ratio on already-preprocessed text.
data['comment'] = data['comment'].apply(str)
java_tags_ratio = data['comment'].apply(FeatureHelper.get_java_tags_ratio).to_numpy()

# Run the project's text preprocessing and keep a handle on the resulting column.
data['comment'] = data['comment'].apply(DataProcesser.preprocess)
comments = data['comment']

Split the comments into train (used for training the model) and test data (used for evaluating the model).

In [108]:
# train_test_split is already imported in the notebook's import cell, so it is
# not imported again here.
# Hold out 25% of the samples for evaluation; the fixed random_state makes the
# split reproducible across runs.
comments_train, comments_test, y_train, y_test = train_test_split(
    comments, values, test_size=0.25, random_state=1000
)
comments_train


431                                  open a share databas
213     execut a callabl task that provid a valu after...
730     copi the select entri and mat them with the se...
1005                               auto gener method stub
130     keep track of chang made to the column like re...
                              ...                        
769                           test the field is legal set
350     initi the compon the layout the data structur ...
1275    panel getunmanag addedit unablemovegroup group...
71      to remov old en or add it to a list of entri t...
599     need to toggl a twice to make sure everyth is ...
Name: comment, Length: 983, dtype: object

Vectorise data - map a numerical value to each word

This is based on the Bag-of-Words Model. It ignores the order of words and focuses only on their frequency.

In [109]:
# TODO: add length after count vectoriser --> separate into files
# Build the 'BOW' text representation and vectorise the comments with it.
# NOTE: `comments` is rebound to the vectorised output here, so re-running this
# cell on its own would pass that output back into vectorize().
text_representation = TextRepresentationFactory.get_text_representation('BOW')
comments = text_representation.vectorize(comments)

# Preview frame: the first three rows, restricted to the columns whose sum over
# those rows is positive (selected with one boolean mask instead of growing a
# frame column by column).
df = text_representation.print(comments).iloc[:3]
newdf = df.loc[:, df.sum() > 0].copy()

comments[:2]

<2x1824 sparse matrix of type '<class 'numpy.int64'>'
	with 6 stored elements in Compressed Sparse Row format>