In [0]:
import pandas as pd
import numpy as np
import sklearn.preprocessing as prep
import sklearn.pipeline as pl
import sklearn.svm as sv
import sklearn.neighbors as nei
import sklearn.metrics as mt
import sklearn.model_selection as ms
import matplotlib.pyplot as plt
import warnings
import sklearn.neural_network as nn
from sklearn.linear_model import RANSACRegressor
warnings.simplefilter("ignore")

In [0]:
# Mount Google Drive into the Colab runtime at /content/drive so the project
# files referenced below can be read and written. Run this cell if the drive
# has not been mounted yet in the current session.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Input and output locations on the mounted Drive. These are plain path
# prefixes that get string-concatenated with a file name; nothing here changes
# the process's working directory.
# NOTE(review): the paths are relative ('drive/...') while the Drive is mounted
# at /content/drive, so this presumably relies on the kernel running from
# /content -- confirm.
data_cwd = 'drive/My Drive/data mining project/data by sector/'
unlabeled_cwd = 'drive/My Drive/data mining project/unlabeled data/'
rlp_result_cwd = 'drive/My Drive/data mining project/rlp_regression_result/'
ran_result_cwd = 'drive/My Drive/data mining project/RANSAC_regression_result/'
# header=None: the CSV has no header row, so its columns are labelled 0, 1, ...
sector_list = pd.read_csv('drive/My Drive/data mining project/sector_list.csv', header=None)

# Pre-Process

In [0]:
def removeNaN(df, has_label):
  """Drop incomplete columns, label outliers and metadata columns.

  Parameters
  ----------
  df : pandas.DataFrame
      Table as read from a sector CSV.
  has_label : bool
      When true, ``df`` must contain a ``label`` column; only rows whose
      label lies strictly within three standard deviations of the mean
      label are kept.

  Returns
  -------
  pandas.DataFrame
      A new frame without the ``symbol``, ``industry`` and
      ``period_end_date`` columns. ``df`` itself is not modified.
  """
  # Every column that contains at least one NaN is removed entirely.
  df = df.dropna(axis=1)
  if has_label:
    label = df['label']
    df = df[np.abs(label - label.mean()) < 3 * label.std()]
  # A metadata column that contained a NaN has already been removed by the
  # dropna above; errors='ignore' keeps that case from raising KeyError.
  df = df.drop(columns=['symbol', 'industry', 'period_end_date'], errors='ignore')
  return df

def split_dataset(df):
  """Separate the target column from the predictor columns.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame containing a ``label`` column.

  Returns
  -------
  tuple
      ``(labels, features)``: the ``label`` column as a Series and a new
      frame holding every other column. ``df`` is not modified.
  """
  labels = df['label']
  # Keyword form on purpose: passing the axis positionally
  # (``df.drop('label', 1)``) is rejected by pandas 2.x.
  features = df.drop(columns='label')
  return labels, features

def normalize(feature):
  """Standardise the feature table with a freshly fitted StandardScaler.

  Parameters
  ----------
  feature : table of numeric predictors accepted by ``StandardScaler.fit``.

  Returns
  -------
  tuple
      ``(scaled, scaler)``: the transformed values wrapped in a new
      DataFrame (default integer row and column labels, so the input's
      labels are not carried over) and the fitted scaler, which can be
      reused to transform other data the same way.
  """
  scaler = prep.StandardScaler(with_mean=True).fit(feature)
  scaled_values = scaler.transform(feature)
  return pd.DataFrame(scaled_values), scaler

# Training Regressors

In [0]:
def nn_train(features, labels, random_state=None):
    """Tune, cross-validate and fit an MLP regressor.

    A grid search (5-fold) picks the hidden-layer width and activation, the
    chosen configuration is scored with a 5-fold cross-validated mean squared
    error, and a model with that configuration is finally fitted on all rows.

    Parameters
    ----------
    features : pandas.DataFrame
        Predictor table; rows are selected positionally with ``.iloc``.
    labels : pandas.Series
        Target values, positionally aligned with ``features``.
    random_state : int or None, optional
        Seed forwarded to every ``MLPRegressor`` created here. The default
        ``None`` leaves the networks unseeded, so results vary between runs.

    Returns
    -------
    tuple
        ``(mse, model)``: the mean of the per-fold MSE values and the
        regressor fitted on the full data.
    """
    n_folds = 5
    mlp_grid = {
        'hidden_layer_sizes': [(30,), (40,), (50,), (60,)],
        'activation': ['logistic', 'tanh', 'relu'],
    }
    search = ms.GridSearchCV(estimator=nn.MLPRegressor(random_state=random_state),
                             param_grid=mlp_grid, cv=n_folds)
    search.fit(features, labels)

    # Pass the winning parameters through unchanged (the layer sizes stay a
    # tuple instead of being unpacked to a bare int).
    mlp = nn.MLPRegressor(random_state=random_state, **search.best_params_)

    # KFold assigns every row to exactly one test fold, so trailing rows are
    # no longer left out when len(labels) is not a multiple of n_folds. It is
    # also the unshuffled split cross_val_score(cv=5) applies to regressors,
    # which keeps this MSE computed on the same folds as RANSAC_train's.
    fold_mse = []
    for train_idx, test_idx in ms.KFold(n_splits=n_folds).split(features):
        mlp.fit(features.iloc[train_idx], labels.iloc[train_idx])
        # Non-finite predictions are mapped to finite numbers before scoring.
        pred = np.nan_to_num(mlp.predict(features.iloc[test_idx]))
        fold_mse.append(mt.mean_squared_error(labels.iloc[test_idx], pred))

    mse = float(np.mean(fold_mse))
    print("MLP MSE is: " + str(mse))

    # The returned model is trained on every row, not just on the last fold.
    mlp = mlp.fit(features, labels)

    return mse, mlp

def RANSAC_train(features, labels):
    """Fit a RANSAC regressor and report its cross-validated error.

    The regressor (seeded with ``random_state=0``) is fitted on all rows, and
    its mean squared error is estimated with 5-fold ``cross_val_score``.

    Returns
    -------
    tuple
        ``(mse, model)``: the mean MSE over the folds and the regressor
        fitted on the full data.
    """
    model = RANSACRegressor(random_state=0)
    model.fit(features, labels)

    # The scorer yields negated MSE values, hence the sign flip below.
    neg_mse_per_fold = ms.cross_val_score(
        model, features, labels, cv=5, scoring='neg_mean_squared_error'
    )
    mse = -neg_mse_per_fold.mean()
    print("RANSAC MSE is: " + str(mse))
    return mse, model


# Main

In [0]:
def build_classifier(file_name):
  """Train both regressors on one sector's labeled data.

  Reads ``data_cwd + file_name + ".csv"``, cleans it with ``removeNaN``,
  splits off the label, standardises the features, and then trains the MLP
  followed by the RANSAC regressor.

  Returns
  -------
  tuple
      ``(mlp_model, ransac_model, mlp_mse, ransac_mse, scaler)``.
  """
  raw = pd.read_csv(data_cwd + file_name + ".csv")

  cleaned = removeNaN(raw, True)
  target, predictors = split_dataset(cleaned)
  scaled, scaler = normalize(predictors)

  mlp_mse, mlp_model = nn_train(scaled, target)
  ransac_mse, ransac_model = RANSAC_train(scaled, target)

  return mlp_model, ransac_model, mlp_mse, ransac_mse, scaler


def predict_data(file_name, clf, std_scl):
  """Predict labels for one sector's unlabeled data.

  Parameters
  ----------
  file_name : str
      Sector name; ``unlabeled_cwd + file_name + ".csv"`` is read.
  clf : fitted regressor exposing ``predict``.
  std_scl : fitted scaler exposing ``transform``.

  Returns
  -------
  numpy.ndarray
      Predictions with non-finite values replaced via ``np.nan_to_num``.
  """
  unlabeled = pd.read_csv(unlabeled_cwd + file_name + ".csv")
  feature = removeNaN(unlabeled, False)

  # removeNaN drops columns according to where NaNs occur in *this* file, so
  # the surviving columns may differ in set or order from the ones the scaler
  # was fitted on. When the scaler recorded its column names and all of them
  # are present, select exactly those columns in the fitted order. Otherwise
  # the frame is passed on unchanged and the scaler applies its own checks.
  fitted_names = getattr(std_scl, 'feature_names_in_', None)
  if fitted_names is not None and set(fitted_names).issubset(feature.columns):
    feature = feature[list(fitted_names)]

  feature = std_scl.transform(feature)
  result = np.nan_to_num(clf.predict(feature))
  return result

In [0]:
# Empty float arrays that collect one cross-validated MSE per sector:
# rlp_mse for the MLP regressor, ran_mse for the RANSAC regressor.
rlp_mse = np.empty(0)
ran_mse = np.empty(0)

In [0]:
# For every sector in sector_list: train both regressors on the labeled data,
# predict the unlabeled data with each, record the cross-validated MSEs and
# write the predictions to the per-model result folders.
for industry in sector_list.iterrows():
  # iterrows() yields (index, row) pairs; the sector name is column 0 of the row.
  ind_name = industry[1][0]
  print(ind_name)
  rlp_clf, ran_clf, rlp_acc, ran_acc, std_scl = build_classifier(ind_name)
  # Each predict_data call re-reads and re-cleans the sector's unlabeled CSV.
  rlp_result = predict_data(ind_name, rlp_clf, std_scl)
  ran_result = predict_data(ind_name, ran_clf, std_scl)

  # rlp_acc / ran_acc hold mean squared errors (lower is better), not accuracies.
  rlp_mse = np.append(rlp_mse, rlp_acc)
  ran_mse = np.append(ran_mse, ran_acc)

  # One headerless, index-free CSV per sector and model.
  pd.DataFrame(rlp_result).to_csv(rlp_result_cwd + ind_name + ".csv", index=False, header=False)
  pd.DataFrame(ran_result).to_csv(ran_result_cwd + ind_name + ".csv", index=False, header=False)

Accomodation & Food Services
MLP MSE is: 4.144032849978497
RANSAC MSE is: 1142224825.8169
Arts & Recreation
MLP MSE is: 9.215336876834124
RANSAC MSE is: 344.155481298929
Asset Management & Securities Brokerage
MLP MSE is: 2.0740303040869392
RANSAC MSE is: 7.284573829031556
Banks
MLP MSE is: 1.0330390269939942
RANSAC MSE is: 18.575906068442276
Business Support Services
MLP MSE is: 11.173766810386786
RANSAC MSE is: 30.83531437646348
Chemical Products
MLP MSE is: 22.618796155529992
RANSAC MSE is: 50.430761555939185
Computer Hardware & Electronics
MLP MSE is: 6.31772071005526
RANSAC MSE is: 30.169164685941677
Conglomerates
MLP MSE is: 2.8756493335018742
RANSAC MSE is: 6.600961198939935
Construction
MLP MSE is: 2.025027765660385
RANSAC MSE is: 3.8975907595319903
Consumer Services
MLP MSE is: 53.5210208993273
RANSAC MSE is: 120.60711744270354
Educational Services
MLP MSE is: 4.0153051473425005
RANSAC MSE is: 50.723654407710896
Food, Beverages, & Tobacco Manufacturing
MLP MSE is: 3.3206533599

In [0]:
# Show the per-sector MSE vectors (MLP first, then RANSAC) ...
print(rlp_mse)
print(ran_mse)
# ... then store each one as a headerless "mse.csv" in its model's result folder.
for _mse_values, _result_dir in ((rlp_mse, rlp_result_cwd), (ran_mse, ran_result_cwd)):
  pd.DataFrame(_mse_values).to_csv(_result_dir + "mse.csv", index=False, header=False)

[  4.14403285   9.21533688   2.0740303    1.03303903  11.17376681
  22.61879616   6.31772071   2.87564933   2.02502777  53.5210209
   4.01530515   3.32065336   2.50108001   5.59382699   1.36993493
  18.1192576    8.05356912  10.70999543   7.41809816  32.48877335
  46.88190395   1.21571582  39.38473173   7.94170246 203.40204828
  11.30838584  18.84414242   5.31592821   5.69183829   9.79357668]
[1.14222483e+09 3.44155481e+02 7.28457383e+00 1.85759061e+01
 3.08353144e+01 5.04307616e+01 3.01691647e+01 6.60096120e+00
 3.89759076e+00 1.20607117e+02 5.07236544e+01 4.30329331e+01
 8.80295073e+00 3.84167281e+01 4.07319960e+00 1.28007676e+04
 3.32592481e+01 1.20814933e+03 1.98130893e+01 3.41326939e+01
 2.05289407e+04 1.87026045e+00 8.56509267e+03 1.78502688e+02
 1.89849344e+05 1.32378148e+01 9.17760374e+01 1.97061683e+01
 2.20368506e+01 1.08280838e+02]
