In [0]:
# import packages
import csv
import numpy as np
import warnings
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier

warnings.filterwarnings('ignore')

In [0]:
# read .tsv files
columns = {0:'ID', 1:'label', 2:'statement', 3:'subject', 4:'speaker', 5:'job_title',
          6:'state', 7:'party', 8:'barely_true', 9:'false', 10:'half_true', 11:'mostly_true',
          12:'pants_on_fire', 13:'context'}

def readTsvFile(file_name):
  tsv_file = open(file_name)
  read_tsv = csv.reader(tsv_file, delimiter='\t')
  dataset = []
  for row in read_tsv:
    dataset.append(row)

  # print('examples:', len(dataset))
  # print('features:', len(dataset[0]))
  # print('row1_example:', dataset[0])
  # print('\n')

  return dataset

In [0]:
# plotting bar charts
def plottingData(column_name, arr_X, arr_y):
  fig = plt.figure()
  ax = fig.add_axes([1, 1, 2, 2])
  ax.bar(arr_X, arr_y)
  ax.set_xlabel(column_name)
  ax.set_ylabel('count')
  plt.xticks(rotation=90)
  plt.show()

In [0]:
def removeSpace(string):
  if string[-1] != ' ':
    return string
  space_count = 0
  pos = -1
  while string[pos] == ' ':
    pos -= 1
  
  return string[:pos+1]

In [0]:
#caculate numbers of each category in each columns
def categoryChecker(dataset, column_name, n):  
  category = {}

  for row in dataset:
    if len(row) > n:
      if row[n] == '' or row[n] == 'N/A':
        row[n] = 'None' #missing data will rename as 'None'
      cleaned_string = removeSpace(row[n])
      if cleaned_string in category:     
        category[cleaned_string] += 1 
      if cleaned_string not in category:
        category[cleaned_string] = 1
    # else:
    #   print('Suspicious case:', row[0]) #len(row) <= n?
  
  category = {k: v for k, v in sorted(category.items(), key=lambda x: x[1],
                                      reverse=True)} #sorting dictionary

  count = 0
  arr_X = []
  arr_y = []
  for k, v in category.items():
    count += v
    arr_X.append(k)
    arr_y.append(v)

  # print(column_name, ':', category)  
  # print('The total number of examples:', count)
  # print('The number of categories:', len(arr_X))
  plottingData(column_name, arr_X, arr_y) #call plotting function

In [0]:
def dataVisualization(dataset):
  for i in [1, 6, 7]:
    categoryChecker(dataset, columns[i], i)

In [0]:
def getStatement(dataset, column_num):
  statement = []

  for row in dataset:
    if len(row) < column_num:
      statement.append('')
      # print(row[0])
    else:
      statement.append(row[column_num])
  
  return statement

In [0]:
def trainRunVectorizer(dataset_words):  
  cv = CountVectorizer(stop_words='english')
  doc = np.array([dataset_words])
  dataset_cv = cv.fit_transform(doc.ravel())

  # print(cv.vocabulary_)
  # print(dataset_cv.toarray())
  # print(dataset_cv.shape)
  return dataset_cv, cv

In [0]:
def runVectorizer(dataset_words, cv):  
  doc = np.array([dataset_words])
  dataset_cv = cv.transform(doc.ravel())

  # print(cv.vocabulary_)
  # print(dataset_cv.toarray())
  # print(dataset_cv.shape)
  return dataset_cv

In [0]:
def runTfidfTransformer(vectorized_statement):
  tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
  np.set_printoptions(precision=2)
  tfidf_transformed = tfidf.fit_transform(vectorized_statement)
  tfidf_transformed_array = tfidf_transformed.toarray()

  return tfidf_transformed_array

In [0]:
def categorizedDataset(dataset, column_nums=[3, 4, 5, 6]):
  pre_categorized_dataset = []
  for column_num in column_nums:
    seen = {}
    categorized_row = []
    counter = 0
    for row in dataset:
      if len(row) > column_num:
        data = row[column_num]
      else:
        data = ''
      
      if data in seen:
        categorized_row.append(seen[data])
      if data not in seen:
        seen[data] = counter
        categorized_row.append(seen[data])
        counter += 1

    pre_categorized_dataset.append(categorized_row)
  
  categorized_dataset = np.array(pre_categorized_dataset).transpose()

  return categorized_dataset

In [0]:
def creditHistory(dataset, column_nums=[8, 9, 10, 11, 12]):
  pre_credit_history_dataset = []
  for row in dataset:
    credit_row = []
    for column_num in column_nums:
      if len(row) > column_num:
        data = row[column_num]
      else:
        data = 0
      credit_row.append(data)

    pre_credit_history_dataset.append(credit_row)
  
  credit_history_dataset = np.array(pre_credit_history_dataset)

  return credit_history_dataset

In [0]:
def getTargetDataset(dataset, column_num = 1):
  pre_target_dataset = []
  seen = {}
  counter = 0
  for row in dataset:
    if row[column_num] in seen:
        pre_target_dataset.append(seen[row[column_num]])
    if row[column_num] not in seen:
        seen[row[column_num]] = counter
        pre_target_dataset.append(seen[row[column_num]])
        counter += 1

  target_dataset = np.array(pre_target_dataset).transpose()

  return target_dataset

In [0]:
def runTrainDataset():
  train_dataset = readTsvFile('train.tsv')
  # dataVisualization(train_dataset)

  y_train = getTargetDataset(train_dataset)

  train_dataset_statement = getStatement(train_dataset, 2)
  train_dataset_context = getStatement(train_dataset, 13)

  categorized_train_dataset = categorizedDataset(train_dataset)
  credit_history_train_dataset = creditHistory(train_dataset)
  del train_dataset

  train_vectorized_statement, cv_statement = trainRunVectorizer(train_dataset_statement)
  train_vectorized_context, cv_context = trainRunVectorizer(train_dataset_context)
  del train_dataset_statement
  del train_dataset_context

  train_tfidfed_statement = runTfidfTransformer(train_vectorized_statement)
  train_tfidfed_context = runTfidfTransformer(train_vectorized_context)
  del train_vectorized_statement
  del train_vectorized_context

  train_vectrized_features = np.column_stack((train_tfidfed_statement, train_tfidfed_context))
  del train_tfidfed_statement
  del train_tfidfed_context

  X_train = np.column_stack((train_vectrized_features, categorized_train_dataset))
  del train_vectrized_features
  del categorized_train_dataset

  X_train = np.column_stack((X_train, credit_history_train_dataset))
  del credit_history_train_dataset

  return X_train, y_train, cv_statement, cv_context

In [0]:
def runValDataset(cv_statement, cv_context): 
  val_dataset = readTsvFile('valid.tsv')
  # dataVisualization(val_dataset)

  y_val = getTargetDataset(val_dataset)

  val_dataset_statement = getStatement(val_dataset, 2)
  val_dataset_context = getStatement(val_dataset, 13)

  categorized_val_dataset = categorizedDataset(val_dataset)
  credit_history_val_dataset = creditHistory(val_dataset)
  val_dataset = None

  val_vectorized_statement = runVectorizer(val_dataset_statement, cv_statement)
  val_vectorized_context = runVectorizer(val_dataset_context, cv_context)
  val_dataset_statement = val_dataset_context = None

  val_tfidfed_statement = runTfidfTransformer(val_vectorized_statement)
  val_tfidfed_context = runTfidfTransformer(val_vectorized_context)
  val_vectorized_statement = val_vectorized_context = None

  val_vectrized_features = np.column_stack((val_tfidfed_statement, val_tfidfed_context))
  val_tfidfed_statement = val_tfidfed_context = None


  X_val = np.column_stack((val_vectrized_features, categorized_val_dataset))
  val_vectrized_features = categorized_val_dataset = None

  X_val = np.column_stack((X_val, credit_history_val_dataset))
  credit_history_val_dataset = None

  return X_val, y_val

In [0]:
def runTestDataset(cv_statement, cv_context):
  test_dataset = readTsvFile('test.tsv')
  # dataVisualization(test_dataset)

  y_test = getTargetDataset(test_dataset)

  test_dataset_statement = getStatement(test_dataset, 2)
  test_dataset_context = getStatement(test_dataset, 13)

  categorized_test_dataset = categorizedDataset(test_dataset)
  credit_history_test_dataset = creditHistory(test_dataset)
  test_dataset = None

  test_vectorized_statement = runVectorizer(test_dataset_statement, cv_statement)
  test_vectorized_context = runVectorizer(test_dataset_context, cv_context)
  test_dataset_statement = test_dataset_context = None

  test_tfidfed_statement = runTfidfTransformer(test_vectorized_statement)
  test_tfidfed_context = runTfidfTransformer(test_vectorized_context)
  test_vectorized_statement = test_vectorized_context = None

  test_vectrized_features = np.column_stack((test_tfidfed_statement, test_tfidfed_context))
  test_tfidfed_statement = test_tfidfed_context = None

  X_test = np.column_stack((test_vectrized_features, categorized_test_dataset))
  test_vectrized_features = categorized_test_dataset = None

  X_test = np.column_stack((X_test, credit_history_test_dataset))
  credit_history_test_dataset = None

  return X_test, y_test

In [0]:
X_train, y_train, cv_statement, cv_context = runTrainDataset()

In [0]:
X_val, y_val = runValDataset(cv_statement, cv_context)

In [0]:
def trainModel(X_train, X_val, y_train, y_val, layers, actives, rates):
  best_val_acc = 0
  best_layer = []
  best_active = []
  best_rate = []
  accuracies = 0
  counter = 0

  for layer in layers:
    for active in actives:
      for rate in rates:
        mlp = MLPClassifier(hidden_layer_sizes=layer, activation=active, learning_rate=rate)
        mlp.fit(X_train, y_train)
        val_acc = mlp.score(X_val, y_val)

        accuracies += val_acc
        counter += 1
 
        if val_acc > best_val_acc:
          best_val_acc = val_acc
          best_layer = [layer]
          best_active = [active]
          best_rate = [rate]
        elif val_acc == best_val_acc:
          best_layer.append(layer)
          best_active.append(active)
          best_rate.append(rate)
        
        print('Accuracy:', val_acc, ',', 'Layer:', layer, ',',
              'Activation:', active, ',', 'Rate:', rate)
  
  mean_accuracy = accuracies/counter
  print('Best Accuracy:', best_val_acc)
  print('Layer', best_layer)
  print('Activation:', best_active)
  print('Rate:', best_rate)
  print('Mean Accuracy:', mean_accuracy)

  return best_layer, best_active, best_rate

In [0]:
best_layer, best_active, best_rate = trainModel(X_train.astype(np.float32), 
                                         X_val.astype(np.float32), 
                                         y_train.astype(np.float32), 
                                         y_val.astype(np.float32), 
                                         [(10,), (10, 10), (50, 50), 
                                          (10, 10, 10), (50, 50, 50)], 
                                          ['relu', 'tanh'], 
                                         ['constant', 'adaptive'])

Accuracy: 0.16043613707165108 , Layer: (10,) , Activation: relu , Rate: constant
Accuracy: 0.117601246105919 , Layer: (10,) , Activation: relu , Rate: adaptive
Accuracy: 0.13161993769470404 , Layer: (10,) , Activation: tanh , Rate: constant
Accuracy: 0.1394080996884735 , Layer: (10,) , Activation: tanh , Rate: adaptive
Accuracy: 0.12694704049844235 , Layer: (10, 10) , Activation: relu , Rate: constant
Accuracy: 0.11370716510903427 , Layer: (10, 10) , Activation: relu , Rate: adaptive
Accuracy: 0.13161993769470404 , Layer: (10, 10) , Activation: tanh , Rate: constant
Accuracy: 0.1440809968847352 , Layer: (10, 10) , Activation: tanh , Rate: adaptive
Accuracy: 0.11448598130841121 , Layer: (50, 50) , Activation: relu , Rate: constant
Accuracy: 0.12227414330218069 , Layer: (50, 50) , Activation: relu , Rate: adaptive
Accuracy: 0.1277258566978193 , Layer: (50, 50) , Activation: tanh , Rate: constant
Accuracy: 0.13629283489096572 , Layer: (50, 50) , Activation: tanh , Rate: adaptive
Accuracy:

In [0]:
del X_val
del y_val

In [0]:
X_test, y_test = runTestDataset(cv_statement, cv_context)
del cv_statement
del cv_context

In [0]:
def testModel(X_train, X_test, y_train, y_test, best_layer, best_active, best_rate):
  mlp = MLPClassifier(hidden_layer_sizes=best_layer, activation=best_active, learning_rate=best_rate)
  mlp.fit(X_train, y_train)
  test_acc = mlp.score(X_test, y_test)
  print('Test accuracy:', test_acc)

In [0]:
testModel(X_train.astype(np.float32), X_test.astype(np.float32), 
        y_train.astype(np.float32), y_test.astype(np.float32), best_layer[0], 
        best_active[0], best_rate[0])

Test accuracy: 0.19179163378058406
