In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso
import sklearn.model_selection as ms
import sklearn.metrics as mt
import warnings
warnings.simplefilter("ignore")

In [0]:
# If you haven't mounted your drive yet, do so by running this code.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········


In [0]:
# Set the current working directory
cwd = 'drive/My Drive/data mining project/'

# Preprocessing

In [0]:
def removeNaN(df, has_label):
  df = df.dropna(axis=1)
  if has_label:
    df = df[np.abs(df['label'] - df['label'].mean()) < 3 * df['label'].std()]
  df = df.drop(labels=['symbol','industry', 'period_end_date', 'quarter1', 'quarter2', 'quarter3', 'quarter4'], axis=1)
  return df

In [0]:
import sklearn.preprocessing as prep
def standardize(df, has_label):
  if has_label:
    attributes = df.drop(columns=['label']).columns
  else:
    attributes = df.columns
  df[attributes] = prep.StandardScaler().fit_transform(df[attributes])
  return df


# Linear Regression


In [0]:
def linear_reg(X, y):
  reg = LinearRegression()
  
  # cross validation to find average mean square error
  scores = ms.cross_val_score(reg, X, y, scoring='neg_mean_squared_error', cv=5)
  print("Linear Regression MSE = ", -scores.mean())

  #build regressor
  linear_clf = LinearRegression()
  linear_clf = linear_clf.fit(X, y)

  return -scores.mean(), linear_clf

# Lasso Regression

In [0]:
def lasso_reg(X, y):
  lasso = Lasso()
  parameters = {
      'alpha': list(np.arange(.01, .09, .01))
  }
  # determine the best parameter
  lasso_grid = ms.GridSearchCV(lasso, parameters, scoring='neg_mean_squared_error', cv=5)
  lasso_grid = lasso_grid.fit(X, y)
  print(lasso_grid.best_params_)

  # cross validation to find average mean square error
  scores = ms.cross_val_score(lasso, X, y, cv=5, scoring='neg_mean_squared_error')
  print("Lasso Regression MSE = ", -scores.mean())

  # build the regressor
  lasso_clf = Lasso(alpha=lasso_grid.best_params_['alpha'])
  lasso_clf = lasso_clf.fit(X, y)
  return -scores.mean(), lasso_clf

#Main

## Predict on companies

In [0]:
def build_classifier(data):
  data = removeNaN(data, True)
  data = standardize(data, True)
  X = data.drop(columns=['label'])
  y = data['label'].values.reshape(-1,1)

  lin_mse, lin_regr = linear_reg(X, y)
  lasso_mse, lasso_regr = lasso_reg(X, y)

  return lin_regr, lasso_regr, lin_mse, lasso_mse

def predict_data(unlabeled_data, clf):
  unlabeled_data = removeNaN(unlabeled_data, False)
  unlabeled_data = standardize(unlabeled_data, False)
  y_pred = clf.predict(unlabeled_data)
  return y_pred

## read by industries and store result

In [0]:
lin_mse_all = np.array([])
lasso_mse_all = np.array([])

industries = pd.read_csv(cwd+'sector_list.csv', header=None)
cwd2 = 'drive/My Drive/data mining project/data by sector/'
cwd3 = 'drive/My Drive/data mining project/unlabeled data/'
for i,row in industries.iterrows():
  industry = row[0]
  print(industry)
  data = pd.read_csv(cwd2 + industry + '.csv')
  unlabeled_data = pd.read_csv(cwd3 + industry + '.csv')
  lin_regr, lasso_regr, lin_mse, lasso_mse = build_classifier(data)
  lin_pred = predict_data(unlabeled_data, lin_regr)
  lasso_pred = predict_data(unlabeled_data, lasso_regr)
  unlabeled_data['lin_pred'] = lin_pred
  unlabeled_data['lasso_pred'] = lasso_pred
  lin_mse_all = np.append(lin_mse_all, lin_mse)
  lasso_mse_all = np.append(lasso_mse_all, lasso_mse)
  lin_df = pd.DataFrame(lin_pred)
  lasso_df = pd.DataFrame(lasso_pred)
  lin_df.to_csv('drive/My Drive/data mining project/linear_regression_result/' + industry + ".csv", header=None, index=False)
  lasso_df.to_csv('drive/My Drive/data mining project/lasso_regression_result/' + industry + ".csv", header=None, index=False)

Accomodation & Food Services
Linear Regression MSE =  1310584.7869188457
{'alpha': 0.08}
Lasso Regression MSE =  4.078019653029536
Arts & Recreation
Linear Regression MSE =  23.66479404870617
{'alpha': 0.08}
Lasso Regression MSE =  6.663651281517934
Asset Management & Securities Brokerage
Linear Regression MSE =  2.118692350819923
{'alpha': 0.05}
Lasso Regression MSE =  2.0825876679083235
Banks
Linear Regression MSE =  1.2299173367074467
{'alpha': 0.08}
Lasso Regression MSE =  1.026838955688444
Business Support Services
Linear Regression MSE =  11.231194778278184
{'alpha': 0.02}
Lasso Regression MSE =  11.14824239822111
Chemical Products
Linear Regression MSE =  23.69776317205442
{'alpha': 0.08}
Lasso Regression MSE =  22.846071843610112
Computer Hardware & Electronics
Linear Regression MSE =  6.746068333559758
{'alpha': 0.08}
Lasso Regression MSE =  6.274309715637725
Conglomerates
Linear Regression MSE =  4.054274301302574
{'alpha': 0.08}
Lasso Regression MSE =  2.5701522558458847
Con

In [0]:
pd.DataFrame(lin_mse_all).to_csv('drive/My Drive/data mining project/linear_regression_result/' + "mse.csv", index=False, header = False)
pd.DataFrame(lasso_mse_all).to_csv('drive/My Drive/data mining project/lasso_regression_result/' + "mse.csv", index=False, header = False)