# Predictions

Do you have any hypothesis?
Can you make any kind of prediction: regression and/or classification?


## Step 1: Import the cleaned data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the cleaned dataset from a CSV file given by a relative path (resolved against the working directory).
data2 = pd.read_csv('DataChallenge_cleanedData.csv')

In [3]:
# Preview the first rows to check that the columns were read as expected.
data2.head()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,gas_type,AC,rain or snow,sun
0,28.0,5.0,26,21.5,12.0,E10,0,0,0
1,12.0,4.2,30,21.5,13.0,E10,0,0,0
2,11.2,5.5,38,21.5,15.0,E10,0,0,0
3,12.9,3.9,36,21.5,14.0,E10,0,0,0
4,18.5,4.5,46,21.5,15.0,E10,0,0,0


## Step 2: Building hypotheses

### Nr. 1: The lower the average speed, the higher the consumption (`consume`).

### Nr. 2: Consumption is higher on rainy or snowy rides.

### Nr. 3: Consumption is higher in cold or in warm weather than at mild temperatures.

### Nr. 4: The gas type (`gas_type`) has no big influence on consumption.

## Step 3: Preparing data for prediction models:

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

### x/y-Split:

In [None]:
# As we are interested in how much fuel is burned, we will use consume as the target y:

In [5]:
# Features are every column except the target; the target is the 'consume' column.
X = data2.drop(columns=['consume'])
y = data2['consume']

### train/test-Split: 

In [7]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 666)

In [8]:
# Separate each split by dtype: numeric columns and object (categorical) columns.
X_train_num, X_train_cat = (X_train.select_dtypes(include = kind) for kind in (np.number, object))
X_test_num, X_test_cat = (X_test.select_dtypes(include = kind) for kind in (np.number, object))

### Min-Max-Scaler & OneHot-Encoder:

In [11]:
# Fit both preprocessors on the training split only, so the test rows never influence
# the fitted value ranges or categories; transform() reuses these fitted objects.
scaler = MinMaxScaler().fit(X_train_num)
encoder = OneHotEncoder().fit(X_train_cat)

In [12]:
def transform(dataframe, fitted_scaler = None, fitted_encoder = None):
    """Scale the numeric columns and one-hot encode the object columns of ``dataframe``.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Features to transform. Object-dtype columns go to the encoder and
        numeric columns go to the scaler; columns of any other dtype are
        not part of the result.
    fitted_scaler, fitted_encoder : optional
        Already-fitted transformers. When omitted, the notebook-level
        ``scaler`` and ``encoder`` are used, which must have been initialized
        on the training data beforehand.

    Returns
    -------
    pd.DataFrame
        The encoded categorical columns followed by the scaled numeric
        columns, carrying the same row index as ``dataframe``.
    """
    cat = dataframe.select_dtypes(include = object)
    num = dataframe.select_dtypes(include = np.number)

    if not cat.empty:
        active_encoder = encoder if fitted_encoder is None else fitted_encoder
        cat_transformed = active_encoder.transform(cat)
        # The encoder may return a sparse matrix; densify only when it does.
        if hasattr(cat_transformed, "toarray"):
            cat_transformed = cat_transformed.toarray()
        # Keep the input index so both parts align in the concat below and
        # the result still lines up with the matching target series.
        cat_new = pd.DataFrame(
            data = cat_transformed,
            columns = active_encoder.get_feature_names_out(),
            index = dataframe.index,
        )
    else:
        cat_new = cat

    if not num.empty:
        active_scaler = scaler if fitted_scaler is None else fitted_scaler
        num_transformed = active_scaler.transform(num)
        num_new = pd.DataFrame(
            data = num_transformed,
            columns = num.columns,
            index = dataframe.index,
        )
    else:
        num_new = num

    # Both parts share dataframe.index, so the column-wise concat cannot
    # introduce extra NaN rows through index misalignment.
    return pd.concat([cat_new, num_new], axis = 1)

In [13]:
# Run both splits through the fitted preprocessing. This rebinds X_train / X_test,
# so re-running the cell would feed already-transformed frames back into transform().
X_train, X_test = transform(X_train), transform(X_test)

In [14]:
# Inspect the transformed training features: encoded gas_type columns first, then the scaled numeric columns.
X_train

Unnamed: 0,gas_type_E10,gas_type_SP98,distance,speed,temp_inside,temp_outside,AC,rain or snow,sun
0,0.0,1.0,0.128026,0.276316,0.461538,0.555556,0.0,0.0,1.0
1,0.0,1.0,0.061453,0.289474,0.923077,0.750000,0.0,0.0,0.0
2,1.0,0.0,0.051210,0.342105,0.384615,0.305556,0.0,0.0,0.0
3,0.0,1.0,0.071229,0.460526,0.461538,0.861111,0.0,0.0,1.0
4,0.0,1.0,0.048883,0.328947,0.384615,0.305556,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
305,0.0,1.0,0.050745,0.605263,0.153846,0.444444,0.0,0.0,0.0
306,0.0,1.0,0.051210,0.539474,0.384615,0.472222,0.0,0.0,0.0
307,0.0,1.0,0.976257,0.868421,0.461538,0.694444,0.0,0.0,0.0
308,0.0,1.0,0.065642,0.368421,0.923077,1.000000,1.0,0.0,0.0


## Step 4: Building prediction models

In [16]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.utils import resample
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from random import randint

# Do not truncate the column list when displaying DataFrames.
pd.set_option('display.max_columns', None)

In [18]:
from sklearn.linear_model import LinearRegression

### Linear regression

In [19]:
# Baseline model: ordinary least squares fitted on the transformed training features.
regr = LinearRegression()
regr.fit(X_train, y_train)

In [20]:
# R² of the fitted model on the training split.
regr.score(X_train, y_train)

0.1753579364097132

In [21]:
# R² of the fitted model on the held-out test split.
regr.score(X_test, y_test)

0.24693623388599928

In [None]:
# The linear regression model explains little of the variance (score ≈ 0.18 on train, ≈ 0.25 on test) - we have to try other models:

In [None]:
### 

## Step 5: Interpret prediction models

In [None]:
# 

## Step 6: Saving Dataframe in .csv:

In [None]:
#data3.to_csv('DataChallenge_PreditionModels.csv', index=False)