In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_boston

In [2]:
# This problem has the following inputs: 
# 1. Frequency, in Hertzs. 
# 2. Angle of attack, in degrees. 
# 3. Chord length, in meters. 
# 4. Free-stream velocity, in meters per second. 
# 5. Suction side displacement thickness, in meters. 

# The only output is: 
# 6. Scaled sound pressure level, in decibels. 

In [3]:
feature_names = ['Frequency','Angle','Chord Length','velocity','displacement thickness','sound pressure']
airfoil = pd.read_csv("./airfoil_self_noise.csv",header=None,sep='\t',names=feature_names)

In [4]:
airfoil.head()

Unnamed: 0,Frequency,Angle,Chord Length,velocity,displacement thickness,sound pressure
0,800,0.0,0.3048,71.3,0.002663,126.201
1,1000,0.0,0.3048,71.3,0.002663,125.201
2,1250,0.0,0.3048,71.3,0.002663,125.951
3,1600,0.0,0.3048,71.3,0.002663,127.591
4,2000,0.0,0.3048,71.3,0.002663,127.461


In [5]:
airfoil = pd.DataFrame(airfoil)
airfoil.head()

Unnamed: 0,Frequency,Angle,Chord Length,velocity,displacement thickness,sound pressure
0,800,0.0,0.3048,71.3,0.002663,126.201
1,1000,0.0,0.3048,71.3,0.002663,125.201
2,1250,0.0,0.3048,71.3,0.002663,125.951
3,1600,0.0,0.3048,71.3,0.002663,127.591
4,2000,0.0,0.3048,71.3,0.002663,127.461


In [98]:
features = airfoil.loc[:,:'displacement thickness']
features.head()

Unnamed: 0,Frequency,Angle,Chord Length,velocity,displacement thickness
0,800,0.0,0.3048,71.3,0.002663
1,1000,0.0,0.3048,71.3,0.002663
2,1250,0.0,0.3048,71.3,0.002663
3,1600,0.0,0.3048,71.3,0.002663
4,2000,0.0,0.3048,71.3,0.002663


In [46]:
frequency = pd.Series(airfoil['Frequency'])
frequency.value_counts().sort_values()

20000      6
16000     13
12500     25
200       35
250       42
10000     42
8000      52
315       56
400       69
500       78
630       88
6300      89
5000      95
800       97
1000      99
1250     100
4000     102
1600     103
3150     103
2500     104
2000     105
Name: Frequency, dtype: int64

In [52]:
data = pd.DataFrame(airfoil)
data.nunique()  # no. of unique values of each feature

Frequency                   21
Angle                       27
Chord Length                 6
velocity                     4
displacement thickness     105
sound pressure            1456
dtype: int64

In [36]:
corelation = airfoil.corr('pearson') 

In [37]:
corelation['sound pressure'].sort_values()

Frequency                -0.390711
displacement thickness   -0.312670
Chord Length             -0.236162
Angle                    -0.156108
velocity                  0.125103
sound pressure            1.000000
Name: sound pressure, dtype: float64

In [82]:
from sklearn.preprocessing import MinMaxScaler

In [83]:
scaler = MinMaxScaler()

In [84]:
y = y.reshape(-1,1)

In [85]:
y = scaler.fit_transform(y)

In [86]:
from sklearn.model_selection import train_test_split 

In [87]:
x_train,x_test,y_train,y_test = train_test_split(features,y,test_size=0.2)

In [88]:
from sklearn.linear_model import LinearRegression

In [89]:
regressor = LinearRegression() 

In [90]:
regressor.fit(x_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [91]:
y_pred = regressor.predict(x_test)

In [92]:
for i in zip(y_pred,y_test):
    print(i)

(array([0.53913906]), array([0.73414524]))
(array([0.70083011]), array([0.81825192]))
(array([0.45201945]), array([0.36128912]))
(array([0.61931678]), array([0.73284229]))
(array([0.13539038]), array([0.14202143]))
(array([0.73521333]), array([0.36014572]))
(array([0.52350109]), array([0.52798681]))
(array([0.52041159]), array([0.73172548]))
(array([0.74192864]), array([0.8113915]))
(array([0.50765581]), array([0.55399261]))
(array([0.68270679]), array([0.66250964]))
(array([0.60878935]), array([0.59855878]))
(array([0.59902032]), array([0.68136251]))
(array([0.37932208]), array([0.24779961]))
(array([0.5860363]), array([0.7348366]))
(array([0.28303949]), array([0.19336826]))
(array([0.65918722]), array([0.72223256]))
(array([0.63601863]), array([0.65025128]))
(array([0.49598476]), array([0.49044061]))
(array([0.543628]), array([0.59390539]))
(array([0.75473053]), array([0.77538756]))
(array([0.43916845]), array([0.38543356]))
(array([0.52239577]), array([0.59164517]))
(array([0.578221

In [93]:
from sklearn.metrics import mean_squared_error, r2_score

In [94]:
mean_squared_error(y_test,y_pred)

0.015313633147768934

In [95]:
r2_score(y_test,y_pred)   # it tells that if there are 100 pts then 56 pts are fit correctly

0.5638783237129562

In [96]:
1-0.03092960700315367

0.9690703929968463