In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score , mean_absolute_error

In [17]:
train_df = pd.read_csv('data.csv')
test_df = pd.read_csv('test.csv')


In [18]:
def generate_index(list):
    return {value: key for key, value in enumerate(list, 1)}


def dropExtraColumns(data):
    columns = data.columns
    data.drop(columns[0], axis=1, inplace=True)
    #data.drop('Temperature in Montreal during episode', axis=1, inplace=True)
    data.drop('Name of episode', axis=1, inplace=True)
    data.drop('Name of show', axis=1, inplace=True)
    data.drop('Length',axis=1,inplace=True)
    
    

def mapping(data):
    
    binary_map = {"Yes": 1, "No": 0}
    episode_map = generate_index(data['Episode'].unique())
    station_map = generate_index(data['Station'].unique())
    season_map = generate_index(data['Season'].unique())
    channel_map = generate_index(data['Channel Type'].unique())
    year_map = generate_index(data["Year"].unique())
    week_map = generate_index(data['Day of week'].unique())
    genre_map = generate_index(data['Genre'].unique())
    date_map = generate_index(data['Date'].unique())

   
    data['Station'] = data['Station'].map(station_map)
    data['Episode'] = data['Episode'].map(episode_map)
    data['Channel Type'] = data['Channel Type'].map(channel_map)
    data['Season'] = data['Season'].map(season_map)
    data['Year'] = data["Year"].map(year_map)
    data['Day of week'] = data['Day of week'].map(week_map)
    data['Genre'] = data['Genre'].map(genre_map)
    data['First time or rerun'] = data['First time or rerun'].map(binary_map)
    data['Movie?'] = data['Movie?'].map(binary_map)
    data['# of episode in the season'] = data['# of episode in the season'].map(binary_map)
    data['Game of the Canadiens during episode?'] = data['Game of the Canadiens during episode?'].map(binary_map)
    data['Date'] = data['Date'].map(date_map)
 

In [19]:
   #converting Start_time and End_time to minutes,adding Start_minutes and End_minutes column
#then dropping the Start_time and End_time
def processTime(data):
    data['Start_time'] = pd.to_datetime(data['Start_time'])
    data['Start_minutes'] = data['Start_time'].dt.hour * 60 + data['Start_time'].dt.minute + data[
        'Start_time'].dt.second / 60
    data['End_time'] = pd.to_datetime(data['End_time'])
    data['End_minutes'] = data['End_time'].dt.hour * 60 + data['End_time'].dt.minute + data[
        'End_time'].dt.second / 60
    data.drop('Start_time', axis=1, inplace=True)
    data.drop('End_time', axis=1, inplace=True)

# Preprocessing

In [20]:
#dropping decided features 
dropExtraColumns(train_df)
dropExtraColumns(test_df)

In [21]:
#converting non numeric datas to numeric ones
mapping(test_df)
mapping(train_df)

In [22]:
#removing Nans
train_df = train_df.dropna(axis = 0)
test_df = test_df.dropna(axis= 0)

In [23]:
#converting Start_time and End_time to minutes
processTime(train_df)
processTime(test_df)

In [24]:
train_df.head(3)

Unnamed: 0,Episode,Station,Channel Type,Season,Year,Date,Day of week,Genre,First time or rerun,# of episode in the season,Movie?,Game of the Canadiens during episode?,Market Share_total,Temperature in Montreal during episode,Start_minutes,End_minutes
0,1,1,1,1,1,1,1,1,0,1,0,0,0.9,20.4,360.0,480.0
1,2,1,1,1,1,1,1,2,0,1,0,0,0.5,19.125,480.0,510.0
2,3,1,1,1,1,1,1,3,0,1,0,0,0.3,19.125,510.0,540.0


In [25]:
test_df.head(3)

Unnamed: 0,Episode,Station,Channel Type,Season,Year,Date,Day of week,Genre,First time or rerun,# of episode in the season,Movie?,Game of the Canadiens during episode?,Temperature in Montreal during episode,Start_minutes,End_minutes
0,1,1,1,1,1,1,1,1,0,1,0,0,-22.525,690.0,720.0
1,2,1,1,1,1,1,1,1,0,1,0,0,-21.625,720.0,750.0
2,3,1,1,1,1,1,1,1,0,1,0,0,-21.625,750.0,780.0


In [26]:
y_train = train_df["Market Share_total"]
x_train = train_df.drop('Market Share_total', axis = 1)
x_test = test_df

In [27]:
from sklearn.ensemble import RandomForestRegressor
predictor = RandomForestRegressor(n_estimators=10)
predictor.fit(x_train,y_train)
y_pre = predictor.predict(x_test)
result = pd.DataFrame( {'Market Share_total':y_pre} , columns = ['Market Share_total']) 
result.to_csv('test_prediction.csv')

In [28]:
#evaluating the model over all the train data:
y_train_pre = predictor.predict(x_train)
r2_score(y_train,y_train_pre)

0.9829462239707079

In [29]:
result.shape

(130936, 1)

In [30]:
x_test.shape

(130936, 15)