In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score , mean_absolute_error

In [14]:
train_df = pd.read_csv('data.csv')
test_df = pd.read_csv('test.csv')

In [15]:


def generate_index(list):
    return {value: key for key, value in enumerate(list, 1)}


def dropExtraColumns(data):
    columns = data.columns
    data.drop(columns[0], axis=1, inplace=True)
    data.drop('Temperature in Montreal during episode', axis=1, inplace=True)
    data.drop('Name of episode', axis=1, inplace=True)
    data.drop('Name of show', axis=1, inplace=True)
    
    

def mapping(data):
    binary_map = {"Yes": 1, "No": 0}
    episode_map = generate_index(data['Episode'].unique())
    station_map = generate_index(data['Station'].unique())
    season_map = generate_index(data['Season'].unique())
    channel_map = generate_index(data['Channel Type'].unique())
    year_map = generate_index(data["Year"].unique())
    week_map = generate_index(data['Day of week'].unique())
    genre_map = generate_index(data['Genre'].unique())
    date_map = generate_index(data['Date'].unique())

    data['Station'] = data['Station'].map(station_map)
    data['Episode'] = data['Episode'].map(episode_map)
    data['Channel Type'] = data['Channel Type'].map(channel_map)
    data['Season'] = data['Season'].map(season_map)
    data['Year'] = data["Year"].map(year_map)
    data['Day of week'] = data['Day of week'].map(week_map)
    data['Genre'] = data['Genre'].map(genre_map)
    data['First time or rerun'] = data['First time or rerun'].map(binary_map)
    data['Movie?'] = data['Movie?'].map(binary_map)
    data['# of episode in the season'] = data['# of episode in the season'].map(binary_map)
    data['Game of the Canadiens during episode?'] = data['Game of the Canadiens during episode?'].map(binary_map)
    data['Date'] = data['Date'].map(date_map)
    

def processTime(data):
    data['Start_time'] = pd.to_datetime(data['Start_time'])
    data['Start_minutes'] = data['Start_time'].dt.hour * 60 + data['Start_time'].dt.minute + data[
        'Start_time'].dt.second / 60
    data['End_time'] = pd.to_datetime(data['End_time'])
    data['End_minutes'] = data['End_time'].dt.hour * 60 + data['End_time'].dt.minute + data[
        'End_time'].dt.second / 60
    data.drop('Start_time', axis=1, inplace=True)
    data.drop('End_time', axis=1, inplace=True)

In [16]:
dropExtraColumns(train_df)
dropExtraColumns(test_df)

In [17]:
mapping(test_df)
mapping(train_df)

In [18]:
train_df = train_df.dropna(axis = 0)
test_df = test_df.dropna(axis= 0)

In [19]:
processTime(train_df)
processTime(test_df)

In [20]:
train_df

Unnamed: 0,Episode,Station,Channel Type,Season,Year,Date,Day of week,Length,Genre,First time or rerun,# of episode in the season,Movie?,Game of the Canadiens during episode?,Market Share_total,Start_minutes,End_minutes
0,1,1,1,1,1,1,1,8,1,0,1,0,0,0.9,360.0,480.0
1,2,1,1,1,1,1,1,2,2,0,1,0,0,0.5,480.0,510.0
2,3,1,1,1,1,1,1,2,3,0,1,0,0,0.3,510.0,540.0
3,4,1,1,1,1,1,1,4,4,0,1,0,0,1.7,540.0,600.0
4,5,1,1,1,1,1,1,2,5,0,1,0,0,2.2,600.0,630.0
5,6,1,1,1,1,1,1,2,5,0,1,0,0,2.7,630.0,660.0
6,7,1,1,1,1,1,1,2,6,0,1,0,0,2.3,660.0,690.0
7,8,1,1,1,1,1,1,4,7,0,1,0,0,1.4,690.0,750.0
8,9,1,1,1,1,1,1,4,7,0,1,0,0,7.5,750.0,810.0
9,9,1,1,1,1,1,1,4,7,0,1,0,0,12.1,810.0,870.0


In [21]:
test_df

Unnamed: 0,Episode,Station,Channel Type,Season,Year,Date,Day of week,Length,Genre,First time or rerun,# of episode in the season,Movie?,Game of the Canadiens during episode?,Start_minutes,End_minutes
0,1,1,1,1,1,1,1,2,1,0,1,0,0,690.0,720.0
1,2,1,1,1,1,1,1,2,1,0,1,0,0,720.0,750.0
2,3,1,1,1,1,1,1,2,1,0,1,0,0,750.0,780.0
3,4,1,1,1,1,1,1,4,2,1,1,0,0,780.0,840.0
4,5,1,1,1,1,1,1,4,2,0,1,0,0,840.0,900.0
5,6,1,1,1,1,1,1,4,2,0,1,0,0,900.0,960.0
6,7,1,1,1,1,1,1,4,2,0,1,0,0,960.0,1020.0
7,8,1,1,1,1,1,1,2,1,0,1,0,0,1020.0,1050.0
8,9,1,1,1,1,1,1,2,2,0,1,0,0,1050.0,1080.0
9,10,1,1,1,1,1,1,2,2,0,1,0,0,1080.0,1110.0


In [22]:
y_train = train_df["Market Share_total"]
x_train = train_df.drop('Market Share_total', axis = 1)
x_test = test_df

In [23]:
from sklearn.ensemble import RandomForestRegressor
predictor = RandomForestRegressor()
predictor.fit(x_train,y_train)
y_pre = predictor.predict(x_test)
result = pd.DataFrame( {'Market Share_total':y_pre} , columns = ['Market Share_total']) 
result.to_csv('result.csv')

