In [1]:
run_data_cleaning = True
testing = True # will only run the first 10 tuples

In [2]:
import numpy as np

# Data Extraction
import pandas as pd

# SentiStrength
import subprocess
import shlex
import os.path
import sys

# Machine Learning
import tensorflow as tf
import sklearn

# 1. Data <a class="anchor" id="data"></a>

## 1.1. Data Extraction <a class="anchor" id="data-extraction"></a>

In [3]:
def get_data(path):

    # read in data
    df = pd.read_csv(path, sep=',', encoding='ISO-8859-1', header=None)
    data = np.array(df)
        
    return data

## 1.2. Manual Feature Selection <a class="anchor" id="manual-feature-selection"></a>

In [4]:
def manually_selected_features(data):
    
    print("> Getting manually selecting features...")
    
    # cols to keep: 1, 5, 6, 13, 14, 19
    # we're also keeping the priority column (5) for now
    cols_to_delete = (0, 2, 3, 4, 7, 8, 9, 10, 11, 12, 15, 16, 17, 18, 20, 21,
                        22, 23, 24, 25, 26, 27, 28, 29, 30, 31)
    
    data = np.delete(data, cols_to_delete, axis=1)
    
    return data

## 1.3. Data Cleaning <a class="anchor" id="data-cleaning"></a>

In [5]:
def clean_data(data):
        
    # ---------- Only keep columns selected manually ----------
    
    print('> Cleaning data...')
    print("\n  Tuples before data cleaning: " + str(data[1:].shape[0]) + '\n')
    data = manually_selected_features(data)
    
    # ---------- Remove rows where data is missing ----------
    
    rows_to_delete = []

    for i, row in enumerate(data):
        for j, val in enumerate(row):
            if (str(row[j]).strip() == 'null'):
                # print("deleting row " + str(i) + ": " + str(row))
                rows_to_delete.append(i)
                break
    
    data = np.delete(data, rows_to_delete, 0)
    # np.savetxt('../dataset/all_data_null_removed.csv', data, delimiter=',', fmt="%s")

    print("\n  Tuples after data cleaning: " + str(data[1:].shape[0]) + '\n')
        
    # ---------- Split total data into design matrix and feature headers ----------
        
    # strip white space from features array and ignore headers in data matrix
    feature_headers = [str(header).strip() for header in  data[0]] # remove white space around strings
    data = data[1:] # excluding headers from data matrix
    
    # transform labels into integer encodings
    labels = [str(val).strip() for val in  data[:,1]]
    labels = LabelEncoder().fit_transform(labels)
    
    data = np.delete(data, 1, 1) # deleting labels column from data matrix
    data = np.c_[data, labels] # add labels column to the end
    
    # remove "priority" header - these are the labels, and have already been extracted.
    feature_headers=np.delete(feature_headers, 1)
    
    # Quantify issue "type" (0) and "reporter" (1)
    data[:,0] = quantify_to_int(data[:,0])
    data[:,1] = quantify_to_int(data[:,1])

    # Apply sentiment analysis to "summary" (2) and "description" (3) features
    data[:,2] = get_sentiment_feature(data[:,2])
    data[:,3] = get_sentiment_feature(data[:,3])
    
    # Convert "description_words" (4) from strings to integers
    data[:,4] = [int(words) for words in data[:,4]]

    return data, feature_headers

## 1.4. Generating New Features <a class="anchor" id="generating-new-features"></a>

### 1.4.1. Sentiment Analysis <a class="anchor" id="sentiment-analysis"></a>

In [6]:
# allows SentiStrength to be called and ran on a single line of text.
def rate_sentiment(senti_string):
    
    if senti_string == '': return 0
    
    # Set the proper paths
    sentistrength_location = "./resources/SentiStrength/SentiStrength.jar" # The location of SentiStrength on your computer
    sentistrength_language_folder = "./resources/SentiStrength/data/" # The location of the unzipped SentiStrength data files on your computer
    
    # Tests the paths are correct.
    # An error will be displayed if there is an issue.
    if not os.path.isfile(sentistrength_location):
        print("SentiStrength not found at: ", sentistrength_location)
    if not os.path.isdir(sentistrength_language_folder):
        print("SentiStrength data folder not found at: ", sentistrength_language_folder)
       
    # Open a subprocess using shlex to get the command line string into the correct args list format
    p = subprocess.Popen(shlex.split("java -jar '" + sentistrength_location + "' stdin sentidata '" + sentistrength_language_folder + "'"),stdin=subprocess.PIPE,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
    # Communicate via stdin the string to be rated. Note that all spaces are replaced with "+"
    b = bytes(senti_string.replace(" ","+"), 'utf-8') # Can't send string in Python 3, must send bytes
    stdout_byte, stderr_text = p.communicate(b)
    stdout_text = stdout_byte.decode("utf-8")  # Convert from byte
    # -------- Edit - Nov 9 2017 --------
    stdout_list = stdout_text.split("\t")      # Split by tab: ['2', '-1','\n']
    del stdout_list[-1]                        # Get rid of the last newline element: ['2', '-1']
    results = list(map(int, stdout_list))      # Convert the characters to integers
    results = results[0] + results[1]          # Combine the positive and the negative
    # -------- END: Edit - Nov 9 2017 --------
    #stdout_text = stdout_text.rstrip().replace("\t"," ") # Remove the tab spacing between the positive and negative ratings. e.g. 1    -5 -> 1 -5
    #return stdout_text + " " + senti_string
    
    return results

In [7]:
# Test to ensure that it works correctly
print(rate_sentiment(""))

0


Given a column of the data, return another that will include a representation of the input.

In [8]:
# https://stackoverflow.com/questions/3173320/text-progress-bar-in-the-console
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█'):
    
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end = '\r')
    
    # Print New Line on Complete
    if iteration == total: 
        print()

In [9]:
def get_sentiment_feature(strings):
    
    print("> Applying sentiment analysis...")
    l = len(strings)
    results = np.zeros(l)
    
    # Initial call to print 0% progress
    printProgressBar(0, l, prefix = '  Progress:', suffix = 'Complete', length = 50)
    
    for i, element in enumerate(strings):
        results[i] = rate_sentiment(element.strip())
        printProgressBar(i + 1, l, prefix = '  Progress:', suffix = 'Complete', length = 50)       
    
    return results

### 1.4.3. Quantify Features <a class="anchor" id="quantify-features"></a>

In [10]:
from sklearn.preprocessing import LabelEncoder
def quantify_to_int(array):
    
    print("> Quantifying feature...")

    label_encoder = LabelEncoder()
    results = label_encoder.fit_transform(array)
                
    return results

### 1.4.4. Adjust Given Labels

The labels are provided in string format; however, we will need to convert them into one_hot vectors in order to use them as different classes in the Neural Network.

In [11]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
def one_hot(array):
    
    print("> Transforming labels into one-hot vectors...")
    
    onehot_encoder = OneHotEncoder(sparse=False)
    
    # assuming array has already been transformed into integer encodings
    # now, convert to binary (one-hot)
    array = array.reshape(len(array), 1)
    results = onehot_encoder.fit_transform(array)
            
    return results

# 2. Implementation <a class="anchor" id="implementation"></a>

## 2.1 Fetch and Clean Data

In [12]:
all_data_path = "../dataset/all_data.csv"
clean_data_path = "../dataset/clean_data.csv"

In [13]:
rows = 10
if (not testing): rows=4000

data, feature_headers = clean_data(get_data(all_data_path)[:rows + 1]) # +1 to include headers

# Saving the clean data into a csv file for future use
np.savetxt(clean_data_path, data, delimiter=',')

tuples_to_print = 5
print("\n  Features considered: " + str(feature_headers) + " = " + str(len(feature_headers)))
print("\n  First " + str(tuples_to_print) + " tuples example:\n" + str(data[:tuples_to_print]))

> Cleaning data...

  Tuples before data cleaning: 10

> Getting manually selecting features...

  Tuples after data cleaning: 10

> Quantifying feature...
> Quantifying feature...
> Applying sentiment analysis...
  Progress: |██████████████████████████████████████████████████| 100.0% Complete
> Applying sentiment analysis...
  Progress: |██████████████████████████████████████████████████| 100.0% Complete

  Features considered: ['type' 'reporter' 'summary' 'description' 'description_words'] = 5

  First 5 tuples example:
[[1 3 0.0 0.0 1 4]
 [0 9 -2.0 0.0 245 2]
 [1 5 0.0 1.0 36 2]
 [0 2 0.0 0.0 17 1]
 [1 4 0.0 2.0 23 2]]


## 2.2. Fetch Clean Data

In [14]:
from sklearn.model_selection import train_test_split
def split_data(data, labels, train_perc):
    
    test_perc = round(1-train_perc, 2)
    x_train, x_test, y_train, y_test = train_test_split(data, labels, train_size=train_perc, test_size=test_perc, random_state=42)

    return x_train, x_test, y_train, y_test

In [15]:
df = pd.read_csv(clean_data_path, sep=',', encoding='ISO-8859-1', header=None)
clean_data = np.array(df)

# get rid of rows containing "nan" in clean data file
rows_to_delete = []
for i, row in enumerate(clean_data):
    for j, val in enumerate(row):
        if (str(row[j]).strip() == 'nan'):
            print("> Deleting row: " + str(row))
            rows_to_delete.append(i)
            break
clean_data = np.delete(clean_data, rows_to_delete, 0)

# don't include the last column; where the labels are
data = (clean_data[:,:-1])

# reshape from (m,) to (m,1), then convert into one-hot vector (m,k)
y = one_hot((clean_data[:,-1]).reshape((-1, 1)))
print("\n  data matrix shape: " + str(data.shape))
print("  labels (y) shape: " + str(y.shape) + '\n')

train_perc = .7 # percentage of total data used for training
x_train, x_test, y_train, y_test = split_data(data, y, train_perc) # randomly splitting up the data
m = x_train.shape[0] # number of tuples for training
n = data.shape[1] # number of features
k = len(y[0]) # number of classes

print("> m (training samples) = " + str(m) + "\n> n (num. features)= " + str(n) + "\n> k (num. classes) = " + str(k))

> Transforming labels into one-hot vectors...

  data matrix shape: (10, 5)
  labels (y) shape: (10, 5)

> m (training samples) = 7
> n (num. features)= 5
> k (num. classes) = 5


In [16]:
y_rand = one_hot(np.floor(np.random.rand(len(y_test),1)*5).astype(int))
print("\n  y_rand shape: " + str(y_rand.shape))

> Transforming labels into one-hot vectors...

  y_rand shape: (3, 2)
