# Car Fuel Consumption

## データセットの確認

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as  sns

### データセットの読込み

CSVファイルは小数点のカンマ -> ピリオド変換のやり方がわからないためExcelのデータを読込む。

In [2]:
# Excelファイルの読込み
df = pd.read_excel('./input/measurements2.xlsx')

In [3]:
# レコード数確認
len(df)

388

In [4]:
# データの概要
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 388 entries, 0 to 387
Data columns (total 12 columns):
distance         388 non-null float64
consume          388 non-null float64
speed            388 non-null int64
temp_inside      376 non-null float64
temp_outside     388 non-null int64
specials         93 non-null object
gas_type         388 non-null object
AC               388 non-null int64
rain             388 non-null int64
sun              388 non-null int64
refill liters    13 non-null float64
refill gas       13 non-null object
dtypes: float64(4), int64(5), object(3)
memory usage: 36.5+ KB


#### 欠損値の補完

temp_insideに若干数の欠損値があるため、平均or中央値or最頻値で補完する。  
specials、refill liters、refill gasは欠損値が多すぎるため諦める。

In [5]:
# データの概要確認
df.describe()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,AC,rain,sun,refill liters
count,388.0,388.0,388.0,376.0,388.0,388.0,388.0,388.0,13.0
mean,19.652835,4.912371,41.927835,21.929521,11.358247,0.07732,0.123711,0.082474,37.115385
std,22.667837,1.033172,13.598524,1.010455,6.991542,0.267443,0.329677,0.275441,8.587282
min,1.3,3.3,14.0,19.0,-5.0,0.0,0.0,0.0,10.0
25%,11.8,4.3,32.75,21.5,7.0,0.0,0.0,0.0,37.6
50%,14.6,4.7,40.5,22.0,10.0,0.0,0.0,0.0,38.0
75%,19.0,5.3,50.0,22.5,16.0,0.0,0.0,0.0,39.0
max,216.1,12.2,90.0,25.5,31.0,1.0,1.0,1.0,45.0


In [6]:
# temp_insideの75%は２２℃近辺なので平均値の約21.9℃で欠損値を補完する。
df_copy = df.copy()
df_copy.temp_inside = df_copy.temp_inside.fillna(df_copy.temp_inside.mean())

In [7]:
# 補完後の確認
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 388 entries, 0 to 387
Data columns (total 12 columns):
distance         388 non-null float64
consume          388 non-null float64
speed            388 non-null int64
temp_inside      388 non-null float64
temp_outside     388 non-null int64
specials         93 non-null object
gas_type         388 non-null object
AC               388 non-null int64
rain             388 non-null int64
sun              388 non-null int64
refill liters    13 non-null float64
refill gas       13 non-null object
dtypes: float64(4), int64(5), object(3)
memory usage: 36.5+ KB


### gas_typeごとのデータ分離

下記のように置き換えた後、gas_typeごとのデータに分離する。

- E10 → 1
- SP98 → 2

In [9]:
# 置き換え前
df_gastype_replace = df_copy.copy()
df_gastype_replace.groupby('gas_type').size()

gas_type
E10     160
SP98    228
dtype: int64

In [10]:
# ガスの種類を二値で判別（E10=1,SP98=2）
df_gastype_replace = df_gastype_replace.replace("E10",1)
df_gastype_replace = df_gastype_replace.replace("SP98",2)

In [11]:
# 置き換え後
df_gastype_replace.groupby('gas_type').size()

gas_type
1    160
2    228
dtype: int64

In [12]:
# gas_typeごとのデータに分離
E10 = df_gastype_replace[df_gastype_replace.gas_type == 1][["distance","consume","speed","temp_inside","temp_outside","rain","sun","AC"]].copy()
SP98 = df_gastype_replace[df_gastype_replace.gas_type == 2][["distance","consume","speed","temp_inside","temp_outside","rain","sun","AC"]].copy()

In [13]:
E10.describe()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,rain,sun,AC
count,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0
mean,21.09625,4.93125,43.50625,21.917429,10.11875,0.1,0.075,0.04375
std,20.307234,0.900956,14.077949,0.653602,6.392185,0.300942,0.264218,0.205181
min,1.7,3.7,14.0,21.0,-5.0,0.0,0.0,0.0
25%,12.075,4.4,35.0,21.5,6.0,0.0,0.0,0.0
50%,15.4,4.8,42.0,21.5,9.0,0.0,0.0,0.0
75%,21.2,5.3,51.0,22.5,14.25,0.0,0.0,0.0
max,130.3,10.8,88.0,25.0,27.0,1.0,1.0,1.0


In [14]:
SP98.describe()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,rain,sun,AC
count,228.0,228.0,228.0,228.0,228.0,228.0,228.0,228.0
mean,18.639912,4.899123,40.820175,21.938007,12.22807,0.140351,0.087719,0.100877
std,24.179598,1.118408,13.170122,1.17784,7.271373,0.348115,0.283509,0.301829
min,1.3,3.3,16.0,19.0,-3.0,0.0,0.0,0.0
25%,11.8,4.2,32.0,21.5,7.0,0.0,0.0,0.0
50%,14.15,4.7,39.5,22.0,11.0,0.0,0.0,0.0
75%,18.15,5.225,48.0,22.0,17.0,0.0,0.0,0.0
max,216.1,12.2,90.0,25.5,31.0,1.0,1.0,1.0


## scikit-learnで線形回帰

### 説明変数

- distance(距離: km)
- speed(速度: km/h)
- temp_inside(車内気温: ℃)
- temp_outside(車外気温: ℃)
- rain(雨の日)
- sun(晴れの日)
- AC(エアコンOn/Off)

### 目的変数

- consume(燃費: L/100km)

In [15]:
from sklearn.linear_model import LinearRegression

In [16]:
y = E10['consume']

In [17]:
X = pd.concat([E10.iloc[:, 0], E10.iloc[:, 2:]], axis=1)

In [18]:
regr_e10 = LinearRegression(fit_intercept=True)
regr_e10.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [19]:
# スコアの検証
regr_e10.score(X, y)

0.20031923525056605

In [20]:
# w0の確認
w0 = regr_e10.intercept_
w0

3.8772692201893584

In [21]:
# w1 〜 w7の確認
np.set_printoptions(precision=4, suppress=True)
W = regr_e10.coef_
W

array([ 0.0006, -0.0133,  0.0913, -0.0419,  0.6229, -0.2258,  0.02  ])

とりあえず、E10のデータで学習まではやってみた。
２乗誤差は出し方が分からないため、一旦保留。

スコアの検証で0.2と、ほとんど学習できていない。Weightの値を見ると、本来使って欲しい距離やスピードのパラメータはほとんど結果に影響しておらず、雨・晴れの影響が強くなっている。
雨・晴れのデータを除けばもう少しまともになるかもしれない。

In [25]:
# 雨・晴れを除く
X = pd.concat([E10.iloc[:, 0], E10.iloc[:, 2:-3]], axis=1)

In [26]:
regr_e10_2 = LinearRegression(fit_intercept=True)
regr_e10_2.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [28]:
# スコアの検証
regr_e10_2.score(X, y)

0.1509778883995503

In [30]:
W = regr_e10_2.coef_
W

array([-0.0001, -0.013 ,  0.105 , -0.0473])

雨・晴れの影響以前に全く学習ができていない。データ云々ではなく、やり方を根本的に見直す必要あり。