In [0]:
import pandas as pd
import numpy as np
import sklearn.preprocessing as prep
import sklearn.pipeline as pl
import sklearn.svm as sv
import sklearn.neighbors as nei
import sklearn.metrics as mt
import sklearn.model_selection as ms
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter("ignore")

In [0]:
# If you haven't mounted your drive yet, do so by running this code.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Set the current working directory
data_cwd = 'drive/My Drive/data mining project/data by sector/'
unlabeled_cwd = 'drive/My Drive/data mining project/unlabeled data/'
svm_result_cwd = 'drive/My Drive/data mining project/sv_regression_result/'
rnr_result_cwd = 'drive/My Drive/data mining project/radius_nearest_neignbor_regression_result/'
sector_list = pd.read_csv('drive/My Drive/data mining project/sector_list.csv', header=None)

# Pre-processing

## remove columns that most values are NaN and remove outliers

In [0]:
def removeNaN(df, has_label):
  df = df.dropna(axis=1)
  if has_label:
    df = df[np.abs(df['label'] - df['label'].mean()) < 3 * df['label'].std()]
  df = df.drop(labels=['symbol', 'industry', 'period_end_date'], axis=1)
  return df


## separate labels and features

In [0]:
def split_dataset(df):
  labels = df['label']
  features = df.drop('label', 1)
  return labels, features

## normalize features

In [0]:
def normalize(feature):
  std_scaler = prep.StandardScaler(with_mean=True)
  std_scaler.fit(feature)
  return pd.DataFrame(std_scaler.transform(feature)), std_scaler

# SVM

In [0]:
def svm_train(feature, label):
  svm = sv.SVR()
  svm_grid = {
    'kernel': ['linear', 'rbf', 'poly'],
    'degree': range(1, 5)
  }
  # determine the best parameter
  svm_grid = ms.GridSearchCV(cv = 5, param_grid = svm_grid, estimator = svm)
  svm_grid = svm_grid.fit(feature, label)
  print(svm_grid.best_params_)

  # cross validation to find average mean square error
  svm_acc = ms.cross_val_score(svm_grid, feature, label, cv=5, scoring='neg_mean_squared_error')
  print("SVR MSE = " + str(-svm_acc.mean()))

  # build the regressor
  svm_clf = sv.SVR(kernel=svm_grid.best_params_['kernel'], degree=svm_grid.best_params_['degree'])
  svm_clf = svm_clf.fit(feature, label)
  return -svm_acc.mean(), svm_clf

# Radius Neighbors Regressor

In [0]:
def rnr_train(feature, label):
  rnr = nei.RadiusNeighborsRegressor(radius=5)
  # cross validation to find the mean square error
  size = len(label) // 5
  score = 0
  for i in range(0, 5):
    test_set = feature[i * size: (i+1) * size]
    train_set = pd.concat([feature[: i*size], feature[(i+1)*size :]])
    test_label = label[i * size: (i+1) * size]
    train_label = pd.concat([label[: i*size], label[(i+1)*size :]])

    rnr_clf = rnr.fit(train_set, train_label)
    pred = np.nan_to_num(rnr_clf.predict(test_set))
    score += mt.mean_squared_error(test_label, pred)

  # create the regressor
  rnr = rnr.fit(feature, label)
  rnr_acc = score / 5
  print("RNR MSE = " + str(rnr_acc))
  return rnr_acc, rnr

# Main

## predict on companies

In [0]:
def build_classifier(file_name):
  data = pd.read_csv(data_cwd + file_name + ".csv")
  
  company = removeNaN(data, True)
  label, feature = split_dataset(company)
  feature, std_scl = normalize(feature)
  svm_acc, svm_clf = svm_train(feature, label)
  rnr_acc, rnr_clf = rnr_train(feature, label)

  return svm_clf, rnr_clf, svm_acc, rnr_acc, std_scl


def predict_data(file_name, clf, std_scl):
  unlabeled = pd.read_csv(unlabeled_cwd + file_name + ".csv")
  feature = removeNaN(unlabeled, False)
  feature = std_scl.transform(feature)
  result = clf.predict(feature)
  result = np.nan_to_num(result)
  return result

## read by industries, store the results

In [0]:
svm_mse = np.array([])
rnr_mse = np.array([])

In [0]:
for industry in sector_list.iterrows():
  ind_name = industry[1][0]
  print(ind_name)
  svm_clf, rnr_clf, svm_acc, rnr_acc, std_scl = build_classifier(ind_name)
  svm_result = predict_data(ind_name, svm_clf, std_scl)
  rnr_result = predict_data(ind_name, rnr_clf, std_scl)

  svm_mse = np.append(svm_mse, svm_acc)
  rnr_mse = np.append(rnr_mse, rnr_acc)

  pd.DataFrame(svm_result).to_csv(svm_result_cwd + ind_name + ".csv", index=False, header=False)
  pd.DataFrame(rnr_result).to_csv(rnr_result_cwd + ind_name + ".csv", index=False, header=False)



Accomodation & Food Services
{'degree': 1, 'kernel': 'rbf'}
SVR MSE = 4.017492500422887
RNR MSE = 4.0866832379690825
Arts & Recreation
{'degree': 1, 'kernel': 'rbf'}
SVR MSE = 6.615512126249724
RNR MSE = 6.635603665073818
Asset Management & Securities Brokerage
{'degree': 1, 'kernel': 'rbf'}
SVR MSE = 2.043164527396328
RNR MSE = 2.0836830396999075
Banks
{'degree': 1, 'kernel': 'rbf'}
SVR MSE = 1.0238550715837769
RNR MSE = 1.0252711217746224
Business Support Services
{'degree': 1, 'kernel': 'rbf'}
SVR MSE = 11.109322409556434
RNR MSE = 11.159389588402135
Chemical Products
{'degree': 1, 'kernel': 'rbf'}
SVR MSE = 22.673319427316095
RNR MSE = 22.97806703732274
Computer Hardware & Electronics
{'degree': 1, 'kernel': 'poly'}
SVR MSE = 6.259612067694475
RNR MSE = 6.417054894006529
Conglomerates
{'degree': 2, 'kernel': 'poly'}
SVR MSE = 2.3855597899310728
RNR MSE = 2.6282977704346573
Construction
{'degree': 1, 'kernel': 'poly'}
SVR MSE = 1.8477471420321432
RNR MSE = 1.912479542489002
Consumer

In [0]:
pd.DataFrame(svm_mse).to_csv(svm_result_cwd + "mse.csv", index=False, header = False)
pd.DataFrame(rnr_mse).to_csv(rnr_result_cwd + "mse.csv", index=False, header = False)

# Mean Square Error for naive prediction (average)

In [0]:
errors = []
for industry in sector_list.iterrows():
  ind_name = industry[1][0]
  d = pd.read_csv(data_cwd +ind_name + ".csv")
  res = d['label']
  avg = d['label'].mean()
  pre = pd.Series([avg for x in range(len(d.index))])
  errors.append([ind_name, mt.mean_squared_error(pre, res)])
df = pd.DataFrame(errors, columns = ['industry', 'mean sq error'])
df.head(30)

Unnamed: 0,industry,mean sq error
0,Accomodation & Food Services,35.357177
1,Arts & Recreation,405.041865
2,Asset Management & Securities Brokerage,30.017355
3,Banks,9.719772
4,Business Support Services,688.572714
5,Chemical Products,379.276387
6,Computer Hardware & Electronics,79.443579
7,Conglomerates,11.287251
8,Construction,15.845251
9,Consumer Services,318.958884


In [0]:
df.to_csv("drive/My Drive/data mining project/Initial_MSE.csv", index=False, header=False)