In [1]:
#imports
import numpy as np
import pandas as pd
import math

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns

sns.set()
%matplotlib inline

In [2]:
#import training dataset
syndrome_df = pd.read_csv('top_20_genes_expression.csv')

In [3]:
#see the columns in our data
syndrome_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 21 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   ILMN_1662358  14 non-null     float64
 1   ILMN_1685540  14 non-null     float64
 2   ILMN_1687384  14 non-null     float64
 3   ILMN_1690099  14 non-null     float64
 4   ILMN_1704418  14 non-null     float64
 5   ILMN_1712487  14 non-null     float64
 6   ILMN_1728734  14 non-null     float64
 7   ILMN_1745374  14 non-null     float64
 8   ILMN_1757467  14 non-null     float64
 9   ILMN_1760620  14 non-null     float64
 10  ILMN_1769615  14 non-null     float64
 11  ILMN_1774949  14 non-null     float64
 12  ILMN_1798181  14 non-null     float64
 13  ILMN_1801776  14 non-null     float64
 14  ILMN_1805175  14 non-null     float64
 15  ILMN_1805750  14 non-null     float64
 16  ILMN_2054019  14 non-null     float64
 17  ILMN_2181892  14 non-null     float64
 18  ILMN_2201596  14 non-null     fl

In [4]:
# take a look at the head of the dataset
syndrome_df.head()

Unnamed: 0,ILMN_1662358,ILMN_1685540,ILMN_1687384,ILMN_1690099,ILMN_1704418,ILMN_1712487,ILMN_1728734,ILMN_1745374,ILMN_1757467,ILMN_1760620,...,ILMN_1774949,ILMN_1798181,ILMN_1801776,ILMN_1805175,ILMN_1805750,ILMN_2054019,ILMN_2181892,ILMN_2201596,ILMN_2276820,PLT
0,440.42755,525.269314,2492.849357,2653.352071,2375.040643,1039.102179,656.231493,587.19835,1488.201557,347.382243,...,248.832407,349.044629,209.399014,193.635107,8701.1,642.829393,208.466929,155.510807,142.155671,290
1,510.435114,577.859607,3141.378714,3259.436714,1710.490907,1230.415693,574.587571,555.729357,2355.029071,306.61825,...,214.441436,450.395529,231.068879,241.573386,10280.88243,1004.128979,234.981964,152.846629,127.772221,374
2,467.726371,470.346521,1755.240693,2341.233214,1989.525143,1104.4512,1135.543036,518.044236,1559.063464,366.900314,...,261.733407,322.61545,308.672814,205.275879,8417.395429,832.241071,304.976979,148.313493,150.115,240
3,280.692964,707.631393,2001.740286,2357.522429,2534.718786,1048.001993,1518.206343,571.006357,1798.38135,324.648479,...,224.984343,378.567543,313.980264,233.981871,12065.86279,584.28365,272.220436,132.022829,137.582207,246
4,189.075779,575.668521,2030.970286,2781.078286,2837.831286,1080.059579,1498.543486,485.782629,1679.023057,285.720321,...,232.087307,420.257843,271.28485,241.944029,10731.72357,915.9422,254.734064,178.543429,149.846657,184


In [5]:
#create  X and Y data
X = syndrome_df.drop('PLT', axis=1)
Y = syndrome_df['PLT']


In [6]:
#Classify the training data and test data
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=42, test_size=0.3)

In [7]:
#Establish linear regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

print('Training score: {}'.format(lr_model.score(X_train, y_train)))
print('Test score: {}'.format(lr_model.score(X_test, y_test)))

y_pred = lr_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = math.sqrt(mse)

print('RMSE: {}'.format(rmse))

Training score: 1.0
Test score: -1.0008439885411589
RMSE: 169.80130155542466


In [8]:
#as you can see the RMSE score is too high which means there is a problem in sample selection. We have to specifiy the sample selection.

X = syndrome_df

#Specify for control and syndrome sample training
X_control = X.loc[0:6]
X_control_training = X_control.sample(n = 5) 

X_syndrome = X.loc[7:14]
X_syndrome_training = X_syndrome.sample(n = 5) 

#Combine control and syndrome data for training
frames = [X_control_training, X_syndrome_training]
X_total_training = pd.concat(frames)
X_total_training

Unnamed: 0,ILMN_1662358,ILMN_1685540,ILMN_1687384,ILMN_1690099,ILMN_1704418,ILMN_1712487,ILMN_1728734,ILMN_1745374,ILMN_1757467,ILMN_1760620,...,ILMN_1774949,ILMN_1798181,ILMN_1801776,ILMN_1805175,ILMN_1805750,ILMN_2054019,ILMN_2181892,ILMN_2201596,ILMN_2276820,PLT
5,269.866193,536.05505,2819.840714,2392.155429,1937.387071,1190.025543,1499.021786,821.166429,2148.291214,276.945886,...,185.187614,395.124779,273.362864,199.399936,12599.83593,780.694207,247.2773,142.581,126.505571,409
2,467.726371,470.346521,1755.240693,2341.233214,1989.525143,1104.4512,1135.543036,518.044236,1559.063464,366.900314,...,261.733407,322.61545,308.672814,205.275879,8417.395429,832.241071,304.976979,148.313493,150.115,240
6,145.362664,632.463686,1982.083786,2448.656071,1563.030057,1100.630736,984.433757,585.829857,1763.009179,289.518693,...,183.370679,346.302107,230.33415,181.394543,9230.092,1375.515693,279.6932,168.808657,131.785964,171
1,510.435114,577.859607,3141.378714,3259.436714,1710.490907,1230.415693,574.587571,555.729357,2355.029071,306.61825,...,214.441436,450.395529,231.068879,241.573386,10280.88243,1004.128979,234.981964,152.846629,127.772221,374
0,440.42755,525.269314,2492.849357,2653.352071,2375.040643,1039.102179,656.231493,587.19835,1488.201557,347.382243,...,248.832407,349.044629,209.399014,193.635107,8701.1,642.829393,208.466929,155.510807,142.155671,290
7,1880.404714,435.104821,21830.94143,1732.040886,1660.361,1749.516671,314.926407,1094.693614,3180.090643,182.648807,...,148.990164,1074.012157,136.149686,178.525121,17150.39071,2680.923786,197.478557,185.678793,99.510888,100
12,4195.879143,430.907407,14089.36671,2296.963286,1290.030436,1307.919221,287.079364,1248.756007,2528.764714,222.054993,...,141.342786,1250.053843,131.621307,162.452471,21488.64857,5093.013143,159.458214,200.288593,108.803636,171
13,548.857293,427.764636,6161.997286,1422.378657,931.01205,1912.211714,183.884407,971.411936,4494.708786,151.934386,...,122.665729,1212.706214,106.804125,171.837393,15757.67143,1282.125329,215.028757,222.362464,108.366497,37
8,13079.83429,410.818229,28032.67786,1541.760593,1142.195486,1514.426421,162.508329,1695.243129,2869.222286,221.425086,...,119.553607,1266.8369,115.639707,160.642221,16481.19857,5650.854071,171.027386,183.18695,98.616018,29
9,978.166057,309.754871,3251.670929,1652.151921,867.41675,1576.595907,159.926714,1028.177114,3841.902214,230.470529,...,197.131721,472.199029,105.678987,163.082436,16254.05357,1641.955979,165.233421,182.264029,122.848836,53


In [13]:
#Specify for control and syndrome sample test
number = 4,5,7,13,14
X_total_test = X.iloc[[3,4,7,10], :]
X_total_test

Unnamed: 0,ILMN_1662358,ILMN_1685540,ILMN_1687384,ILMN_1690099,ILMN_1704418,ILMN_1712487,ILMN_1728734,ILMN_1745374,ILMN_1757467,ILMN_1760620,...,ILMN_1774949,ILMN_1798181,ILMN_1801776,ILMN_1805175,ILMN_1805750,ILMN_2054019,ILMN_2181892,ILMN_2201596,ILMN_2276820,PLT
3,280.692964,707.631393,2001.740286,2357.522429,2534.718786,1048.001993,1518.206343,571.006357,1798.38135,324.648479,...,224.984343,378.567543,313.980264,233.981871,12065.86279,584.28365,272.220436,132.022829,137.582207,246
4,189.075779,575.668521,2030.970286,2781.078286,2837.831286,1080.059579,1498.543486,485.782629,1679.023057,285.720321,...,232.087307,420.257843,271.28485,241.944029,10731.72357,915.9422,254.734064,178.543429,149.846657,184
7,1880.404714,435.104821,21830.94143,1732.040886,1660.361,1749.516671,314.926407,1094.693614,3180.090643,182.648807,...,148.990164,1074.012157,136.149686,178.525121,17150.39071,2680.923786,197.478557,185.678793,99.510888,100
10,1287.063036,409.577343,4728.476357,2007.385071,1420.320029,1347.204593,1005.144086,802.560686,3200.785071,220.41285,...,143.165529,657.791679,166.217321,165.876321,12024.75671,2206.815286,184.817293,204.334636,112.644836,108


In [58]:
#Define Y training and Y test
Y_train = X_total_training['PLT']
X_training = X_total_training.drop('PLT', axis=1)

Y_test = X_total_test['PLT']
X_test = X_total_test.drop('PLT', axis=1)

X_training

Unnamed: 0,ILMN_1662358,ILMN_1685540,ILMN_1687384,ILMN_1690099,ILMN_1704418,ILMN_1712487,ILMN_1728734,ILMN_1745374,ILMN_1757467,ILMN_1760620,ILMN_1769615,ILMN_1774949,ILMN_1798181,ILMN_1801776,ILMN_1805175,ILMN_1805750,ILMN_2054019,ILMN_2181892,ILMN_2201596,ILMN_2276820
5,269.866193,536.05505,2819.840714,2392.155429,1937.387071,1190.025543,1499.021786,821.166429,2148.291214,276.945886,408.552857,185.187614,395.124779,273.362864,199.399936,12599.83593,780.694207,247.2773,142.581,126.505571
2,467.726371,470.346521,1755.240693,2341.233214,1989.525143,1104.4512,1135.543036,518.044236,1559.063464,366.900314,645.356679,261.733407,322.61545,308.672814,205.275879,8417.395429,832.241071,304.976979,148.313493,150.115
6,145.362664,632.463686,1982.083786,2448.656071,1563.030057,1100.630736,984.433757,585.829857,1763.009179,289.518693,342.964886,183.370679,346.302107,230.33415,181.394543,9230.092,1375.515693,279.6932,168.808657,131.785964
1,510.435114,577.859607,3141.378714,3259.436714,1710.490907,1230.415693,574.587571,555.729357,2355.029071,306.61825,540.615029,214.441436,450.395529,231.068879,241.573386,10280.88243,1004.128979,234.981964,152.846629,127.772221
0,440.42755,525.269314,2492.849357,2653.352071,2375.040643,1039.102179,656.231493,587.19835,1488.201557,347.382243,541.601464,248.832407,349.044629,209.399014,193.635107,8701.1,642.829393,208.466929,155.510807,142.155671
7,1880.404714,435.104821,21830.94143,1732.040886,1660.361,1749.516671,314.926407,1094.693614,3180.090643,182.648807,858.513879,148.990164,1074.012157,136.149686,178.525121,17150.39071,2680.923786,197.478557,185.678793,99.510888
12,4195.879143,430.907407,14089.36671,2296.963286,1290.030436,1307.919221,287.079364,1248.756007,2528.764714,222.054993,1016.607493,141.342786,1250.053843,131.621307,162.452471,21488.64857,5093.013143,159.458214,200.288593,108.803636
13,548.857293,427.764636,6161.997286,1422.378657,931.01205,1912.211714,183.884407,971.411936,4494.708786,151.934386,603.257507,122.665729,1212.706214,106.804125,171.837393,15757.67143,1282.125329,215.028757,222.362464,108.366497
8,13079.83429,410.818229,28032.67786,1541.760593,1142.195486,1514.426421,162.508329,1695.243129,2869.222286,221.425086,786.613,119.553607,1266.8369,115.639707,160.642221,16481.19857,5650.854071,171.027386,183.18695,98.616018
9,978.166057,309.754871,3251.670929,1652.151921,867.41675,1576.595907,159.926714,1028.177114,3841.902214,230.470529,1008.07255,197.131721,472.199029,105.678987,163.082436,16254.05357,1641.955979,165.233421,182.264029,122.848836


In [59]:
#Establish linear regression model after specifying the samples
lr_model = LinearRegression()
lr_model.fit(X_training, Y_train)

print('Training score: {}'.format(lr_model.score(X_training, Y_train)))
print('Test score: {}'.format(lr_model.score(X_test, Y_test)))

y_pred = lr_model.predict(X_test)
mse = mean_squared_error(Y_test, y_pred)
rmse = math.sqrt(mse)

print('RMSE: {}'.format(rmse))

Training score: 1.0
Test score: -8.648008479813553
RMSE: 185.55681141454932


In [60]:
#Linear regression modelling

steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('model', LinearRegression())
]

pipeline = Pipeline(steps)

pipeline.fit(X_training, Y_train)

print('Training score: {}'.format(pipeline.score(X_training, Y_train)))
print('Test score: {}'.format(pipeline.score(X_test, Y_test)))

Training score: 1.0
Test score: -4.131619181166517


In [61]:
#Regularization using Ridge regression
steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('model', Ridge(alpha=10, fit_intercept=True))
]

ridge_pipe = Pipeline(steps)
ridge_pipe.fit(X_training, Y_train)

print('Training Score: {}'.format(ridge_pipe.score(X_training, Y_train)))
print('Test Score: {}'.format(ridge_pipe.score(X_test, Y_test)))


Training Score: 0.9931147971316276
Test Score: -3.1311936363627346


In [100]:
#Regularization using lasso regression
steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('model', Lasso(alpha=0.2, fit_intercept=True))
]

lasso_pipe = Pipeline(steps)

lasso_pipe.fit(X_train, y_train)

print('Training score: {}'.format(lasso_pipe.score(X_training, Y_train)))
print('Test score: {}'.format(lasso_pipe.score(X_test, Y_test)))

Training score: 0.6779071284439975
Test score: 0.9999911805316899


In [93]:
#Only use top 10 genes
Y_train_top10 = Y_train
X_training_top10 = X_total_training.iloc[:,0:10]

Y_test_top10 = Y_test
X_test_top10 = X_total_test.iloc[:,0:10]

X_test_top10
Y_test_top10

3     246
4     184
7     100
10    108
Name: PLT, dtype: int64

In [94]:
#Establish linear regression model after specifying the samples
lr_model = LinearRegression()
lr_model.fit(X_training_top10, Y_train_top10)

print('Training score: {}'.format(lr_model.score(X_training_top10, Y_train_top10)))
print('Test score: {}'.format(lr_model.score(X_test_top10, Y_test_top10)))

y_pred = lr_model.predict(X_test_top10)
mse = mean_squared_error(Y_test_top10, y_pred)
rmse = math.sqrt(mse)

print('RMSE: {}'.format(rmse))

Training score: 1.0
Test score: -10.008591773475175
RMSE: 198.209262880395


In [95]:
#Linear regression modelling

steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('model', LinearRegression())
]

pipeline = Pipeline(steps)

pipeline.fit(X_training_top10, Y_train_top10)

print('Training score: {}'.format(pipeline.score(X_training_top10, Y_train_top10)))
print('Test score: {}'.format(pipeline.score(X_test_top10, Y_test_top10)))

Training score: 1.0
Test score: -8.171745622679731


In [97]:
#Regularization using lasso regression
steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('model', Lasso(alpha=30, fit_intercept=True))
]

lasso_pipe = Pipeline(steps)

lasso_pipe.fit(X_training_top10, Y_train_top10)

print('Training score: {}'.format(lasso_pipe.score(X_training_top10, Y_train_top10)))
print('Test score: {}'.format(lasso_pipe.score(X_test_top10, Y_test_top10)))

Training score: 0.850457892157072
Test score: -2.0309782051684095


In [66]:
#Only use top 2 genes
Y_train_top10 = Y_train
X_training_top10 = X_total_training.iloc[:,0:2]

Y_test_top10 = Y_test
X_test_top10 = X_total_test.iloc[:,0:2]

X_training_top10

Unnamed: 0,ILMN_1662358,ILMN_1685540
5,269.866193,536.05505
2,467.726371,470.346521
6,145.362664,632.463686
1,510.435114,577.859607
0,440.42755,525.269314
7,1880.404714,435.104821
12,4195.879143,430.907407
13,548.857293,427.764636
8,13079.83429,410.818229
9,978.166057,309.754871


In [74]:
#Establish linear regression model after specifying the samples
lr_model = LinearRegression()
lr_model.fit(X_training_top10, Y_train_top10)

print('Training score: {}'.format(lr_model.score(X_training_top10, Y_train_top10)))
print('Test score: {}'.format(lr_model.score(X_test_top10, Y_test_top10)))

y_pred = lr_model.predict(X_test_top10)
mse = mean_squared_error(Y_test_top10, y_pred)
rmse = math.sqrt(mse)

print('RMSE: {}'.format(rmse))

Training score: 1.0
Test score: -10.008591773475175
RMSE: 198.209262880395


In [92]:
#Regularization using lasso regression
steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('model', Lasso(alpha=30, fit_intercept=True))
]

lasso_pipe = Pipeline(steps)

lasso_pipe.fit(X_training_top10, Y_train_top10)

print('Training score: {}'.format(lasso_pipe.score(X_training_top10, Y_train_top10)))
print('Test score: {}'.format(lasso_pipe.score(X_test_top10, Y_test_top10)))

Training score: 0.850457892157072
Test score: -2.0309782051684095


In [76]:
#Linear regression modelling
steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('model', LinearRegression())
]

lasso_pipe = Pipeline(steps)

lasso_pipe.fit(X_training_top10, Y_train_top10)

print('Training score: {}'.format(lasso_pipe.score(X_training_top10, Y_train_top10)))
print('Test score: {}'.format(lasso_pipe.score(X_test_top10, Y_test_top10)))

Training score: 1.0
Test score: -8.171745622679731
