In [None]:
# Mount Google Drive at /content/drive (Colab-only helper); the CSV files
# read further down are loaded from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Importing Libraries

import numpy as np
import pandas as pd
import math

Preprocessing

In [None]:
# Importing the CSV files

# Single place to edit when the data lives in a different folder.
DATA_DIR = "/content/drive/MyDrive/College/ML Sem 6/Assignment 2"

changes = pd.read_csv(f"{DATA_DIR}/changes-visitors-covid_final.csv")
covid = pd.read_csv(f"{DATA_DIR}/covid-data.csv")

In [None]:
# Making the necessary changes to the loaded frames

# Keep only the rows for India and rename the columns so they line up with
# the covid frame. Filtering and renaming in one chained expression builds a
# new frame instead of renaming a filtered slice in place, which is what
# raised pandas' SettingWithCopyWarning.
changes = (
    changes[changes['Entity'] == 'India']
    .rename(columns={"Day": "date", "Code": "iso_code", "Entity": "location"})
)

# Only these covid columns are used later; restrict them to India as well.
covid = covid[["iso_code", "continent", "location", "date", "new_cases"]]
covid = covid[covid['location'] == 'India']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  changes.rename(columns={"Day":"date","Code":"iso_code","Entity":"location"},inplace=True)


In [None]:
# Join the two frames on the keys they share: date, location and iso_code
# (inner join, so only rows present in both frames survive).
df = changes.merge(covid, how='inner', on=['date', 'location', 'iso_code'])

In [None]:
# The attributes date, iso_code, continent and location are not used by the models,
# so keep only the six mobility columns plus new_cases.
df = df[["retail_and_recreation","grocery_and_pharmacy","residential","transit_stations","parks","workplaces","new_cases"]]

In [None]:
# Dropping NULL values
# DataFrame.dropna() returns a new frame and leaves the original untouched,
# so the result has to be assigned back; otherwise the rows with missing
# values stay in `df` and reach the models.
df = df.dropna()
df

Unnamed: 0,retail_and_recreation,grocery_and_pharmacy,residential,transit_stations,parks,workplaces,new_cases
0,0.667,1.667,0.000,2.000,3.000,3.000,0.0
1,0.500,1.750,0.000,2.000,3.250,3.000,0.0
2,0.400,1.800,0.200,1.800,2.800,3.200,0.0
3,0.500,2.000,0.000,2.333,3.167,3.333,0.0
4,-0.143,1.714,0.714,1.429,3.571,0.143,0.0
...,...,...,...,...,...,...,...
466,-61.714,-25.000,24.143,-49.143,-41.000,-45.429,173790.0
467,-61.286,-24.429,23.714,-48.714,-40.000,-44.571,165553.0
468,-61.143,-24.714,23.714,-49.000,-39.143,-44.286,152734.0
469,-60.143,-23.429,23.286,-48.286,-38.000,-43.429,127510.0


Helper Functions: Outlier Removal and Train/Validation/Test Split

In [None]:
# Removing Outliers

def remove_outliers(col_name, df):
    """Replace IQR outliers in one column of ``df`` with NaN, in place.

    A value counts as an outlier when it lies below ``Q1 - 1.5 * IQR`` or
    above ``Q3 + 1.5 * IQR``. Both quartiles use midpoint interpolation.
    Values that are already missing are ignored when the quartiles are
    computed.

    Parameters
    ----------
    col_name : label of the numeric column to clean.
    df : pandas.DataFrame that is modified in place.

    Returns
    -------
    None. The flagged cells become NaN; use ``df = df.dropna()`` afterwards
    to discard those rows.
    """
    column = df[col_name]

    # Series.quantile skips NaN. The previous np.percentile(...,
    # interpolation=...) call used a deprecated keyword and returned NaN as
    # soon as the column held a single NaN, so no outlier was ever flagged.
    q1 = column.quantile(0.25, interpolation='midpoint')
    q3 = column.quantile(0.75, interpolation='midpoint')
    iqr = q3 - q1

    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr

    # One vectorised assignment covers every row; repeating it len(df) times
    # (as the old loop did) only multiplied the running time.
    outside = (column < lower) | (column > upper)
    df.loc[outside, col_name] = np.nan


In [None]:
# Train Val Test Split

# df : dataset
# train_size : proportion of instances for training
# val_size  : proportion of instances for validation

def train_val_test_split(df, train_size, val_size, random_state=24):
    """Shuffle ``df`` and cut it into train, validation and test frames.

    Parameters
    ----------
    df : pandas.DataFrame to split.
    train_size : proportion of the rows (0..1) that go to the training set.
    val_size : proportion of the rows (0..1) that go to the validation set.
    random_state : seed for the shuffle; the default of 24 reproduces the
        split this notebook has always used.

    Returns
    -------
    (train, val, test) : three DataFrames. ``test`` receives every row that
    is left after the train and validation rows have been taken.
    """
    # Shuffle the rows so the three parts are not biased by the original order.
    shuffled = df.sample(frac=1, random_state=random_state)

    n_rows = len(shuffled)
    train_end = int(n_rows * train_size)
    val_end = train_end + int(n_rows * val_size)

    # .iloc makes the slicing explicitly positional, whatever the index holds
    # after shuffling.
    train = shuffled.iloc[:train_end]
    val = shuffled.iloc[train_end:val_end]
    test = shuffled.iloc[val_end:]

    return train, val, test

Defining Decision Tree

In [None]:
# CART

def is_numeric(value):
  """Tell whether ``value`` is an ``int`` or a ``float`` instance.

  ``bool`` values also qualify because ``bool`` is a subclass of ``int``.
  """
  return isinstance(value, (int, float))

def class_counts(rows):
  """Tally how often each label occurs, the label being a row's last item.

  Returns a plain dict ``{label: count}`` whose keys appear in the order in
  which the labels were first seen.
  """
  tally = {}

  for record in rows:
    target = record[-1]
    tally[target] = tally.get(target, 0) + 1

  return tally


class DecisionNode:
  """Inner node of the tree: a threshold plus the two subtrees below it."""

  def __init__(self, threshold = None, true_branch = None, false_branch = None):
    # Plain value holder; all three attributes default to None.
    self.threshold, self.true_branch, self.false_branch = (
        threshold, true_branch, false_branch)

class DecisionLeafNode:
  """Terminal node of the tree; stores the label tally of the rows it received."""

  def __init__(self, predictions):
    # `predictions` arrives as raw rows; only their {label: count} summary
    # (as produced by class_counts) is kept on the node.
    label_tally = class_counts(predictions)
    self.predictions = label_tally

class Threshold:
  """A yes/no question about one feature position of a row.

  ``feature`` is the position looked up in the row and ``value`` is what the
  looked-up item is compared against.
  """

  def __init__(self, feature, value):
    self.feature, self.value = feature, value

  def match(self, row):
    """Answer the question for ``row``.

    Numeric items pass when they are greater than or equal to the stored
    value; any other item passes only when it equals the stored value.
    """
    candidate = row[self.feature]

    if not is_numeric(candidate):
      return candidate == self.value
    return candidate >= self.value

  def __repr__(self):
    # The operator shown depends on the stored value, matching match().
    operator = '>=' if is_numeric(self.value) else '=='
    return 'Is %s %s %s?' % (self.feature, operator, str(self.value))

class DecisionTree:
  """Binary decision tree over list-shaped rows, used here as a regressor.

  Every row is a sequence whose last item is the target and whose leading
  ``n_features`` items are the features. Splits are chosen greedily by the
  gain computed in ``_information_gain``. A leaf predicts the mean target of
  the training rows that reached it.
  """

  def __init__(self, n_features = None, root = None, max_depth = 100):
    # Number of feature columns; overwritten by fit().
    self.n_features = n_features

    # Root node of the tree; replaced by fit().
    self.root = root
    # Depth at which _build_tree stops splitting and emits a leaf.
    self.max_depth = max_depth


  def fit(self, training_data):
    """Grow the tree from ``training_data``.

    ``training_data`` is a non-empty list of rows; the first row's length
    fixes the number of features (everything except the last item).
    An empty list raises IndexError.
    """
    self.n_features = len(training_data[0]) - 1
    self.root = self._build_tree(training_data)


  def _partition(self, training_data, threshold):
    """Split rows into (rows matching ``threshold``, rows not matching)."""
    true_values = []
    false_values = []

    for row in training_data:
      # NOTE(review): a row whose feature has a different type than the
      # threshold value (e.g. int vs float) lands in neither list and is
      # silently dropped from the subtree - confirm this is intended.
      if type(threshold.value) == type(row[threshold.feature]):
        if threshold.match(row):
          true_values.append(row)
        else:
          false_values.append(row)

    return true_values, false_values


  def _entropy(self, rows):
    """Impurity score of ``rows`` based on the label proportions.

    Despite the name this is not Shannon entropy: it returns ``-sum(p**2)``,
    i.e. the Gini impurity ``1 - sum(p**2)`` shifted by a constant -1. The
    constant cancels in ``_information_gain`` because the two child weights
    sum to 1, so the gain equals the Gini gain. Returns 0 for no rows.
    """
    counts = class_counts(rows)
    n_rows = float(len(rows))

    result = 0
    for lbl in counts:
        prob = counts[lbl] / n_rows
        result -= prob ** 2

    return result


  def _information_gain(self, true_values, false_values, parent_entropy):
    """Parent impurity minus the size-weighted impurity of the two children."""
    child_weight = float(len(true_values) / (len(true_values) + len(false_values)))

    return parent_entropy - (
        child_weight * self._entropy(true_values) + (
        1 - child_weight) * self._entropy(false_values))


  def _find_best_split(self, training_data):
    """Search every feature/value pair for the split with the highest gain.

    Feature values must be numbers, because ``math.isnan`` is applied to
    each candidate value. Returns ``(best_gain, best_threshold)``;
    ``best_threshold`` stays None when no candidate produces two non-empty
    sides.
    """
    best_gain = 0
    best_threshold = None
    parent_entropy = self._entropy(training_data)

    # Try every distinct value of every feature as a threshold and keep the
    # one with the highest gain.
    for feature in range(self.n_features):
      values = set(row[feature] for row in training_data)

      for value in values:
        # NaN cannot serve as a threshold: every comparison with it is False.
        if math.isnan(value):
          continue

        threshold = Threshold(feature, value)

        true_values, false_values = self._partition(training_data, threshold)

        # A split that leaves one side empty separates nothing.
        if len(true_values) == 0 or len(false_values) == 0:
          continue

        gain = self._information_gain(true_values, false_values, parent_entropy)

        # ">=" means a later candidate wins a tie with an earlier one.
        if gain >= best_gain:
          best_gain, best_threshold = gain, threshold

    return best_gain, best_threshold


  def _build_tree(self, training_data, depth = 0):
    """Recursively build the subtree for ``training_data``.

    Produces a leaf when ``max_depth`` is reached or when the best split has
    zero gain; otherwise a DecisionNode with two recursively built branches.
    """
    if depth == self.max_depth:
      return DecisionLeafNode(training_data)

    gain, threshold = self._find_best_split(training_data)

    if gain == 0:
      return DecisionLeafNode(training_data)

    true_values, false_values = self._partition(training_data, threshold)

    # Recursively build left and right subtrees.
    true_branch = self._build_tree(true_values, depth + 1)
    false_branch = self._build_tree(false_values, depth + 1)

    return DecisionNode(threshold, true_branch, false_branch)


  def print_tree(self, node = None, spacing = ""):
    """Print the subtree under ``node`` (the root when omitted), indented by depth."""
    if node is None:
      node = self.root

    if isinstance(node, DecisionLeafNode):
      print(spacing + "Predict ", node.predictions)
      return

    print(spacing + str(node.threshold))

    print(spacing + '--> True:')
    self.print_tree(node.true_branch, spacing + "  ")

    print(spacing + '--> False:')
    self.print_tree(node.false_branch, spacing + "  ")


  def _mean_of_leaf(self, predictions):
    """Mean target value of a leaf.

    ``predictions`` maps each target value to the number of training rows
    that carried it. Every value is weighted by its count so the result is
    the mean over the rows; averaging only the distinct keys (the previous
    behaviour) skewed the prediction whenever a value occurred more than
    once. Raises ZeroDivisionError for an empty mapping.
    """
    total = 0
    n_rows = 0

    for label, count in predictions.items():
      total = total + label * count
      n_rows = n_rows + count

    return total / n_rows


  def classify(self, test_data, node = None):
    """Predict the target for one row by walking from ``node`` (default: root) to a leaf."""

    if node is None:
      node = self.root

    if isinstance(node, DecisionLeafNode):
      return self._mean_of_leaf(node.predictions)

    if node.threshold.match(test_data):
      return self.classify(test_data, node.true_branch)
    else:
      return self.classify(test_data, node.false_branch)



class C45Regressor(DecisionTree):
  """DecisionTree variant that ranks splits by gain ratio (gain / split info)."""

  def _split_info(self, true_values, false_values):
    """Base-2 entropy of the split proportions themselves.

    An empty side contributes nothing (``0 * log2(0)`` is taken as 0)
    instead of raising a math domain error.
    """
    total = len(true_values) + len(false_values)

    split_info = 0

    for side in (true_values, false_values):
      if len(side) == 0:
        continue
      ratio = len(side) / total
      split_info -= ratio * math.log2(ratio)

    return split_info

  def _information_gain(self, true_values, false_values, parent_entropy):
    """Gain ratio: the parent class's gain divided by the split information."""
    # Reuse the parent's formula instead of keeping a second copy of it here.
    information_gain = super()._information_gain(
        true_values, false_values, parent_entropy)

    split_info = self._split_info(true_values, false_values)

    # A split with one empty side has zero split information; report zero
    # gain for it rather than dividing by zero.
    if split_info == 0:
      return 0.0

    return information_gain / split_info



Helper Functions: Error Metrics and Depth Search

In [None]:
#  Function to calculate error

def square_errors(actual,predicted):
  """Compute the mean squared error and its square root.

  Parameters
  ----------
  actual : sequence of rows; the last item of each row is the true target.
  predicted : sequence of predicted targets, one per row of ``actual``.

  Returns
  -------
  (mean_square_error, root_mean_square_error)

  Raises
  ------
  ValueError : when ``actual`` is empty or the two sequences differ in
      length. Previously these cases ended in a ZeroDivisionError, an
      IndexError, or a silently truncated comparison.
  """
  if len(actual) == 0:
    raise ValueError("square_errors needs at least one row")

  if len(actual) != len(predicted):
    raise ValueError(
        "actual and predicted differ in length: %d vs %d"
        % (len(actual), len(predicted)))

  square_error = sum(
      (row[-1] - guess) ** 2 for row, guess in zip(actual, predicted))

  mean_square_error = square_error / len(actual)

  root_mean_square_error = math.sqrt(mean_square_error)

  return mean_square_error, root_mean_square_error

In [None]:
# Function to find best max_depth

def _find_best_depth(model_cls, x_train, val_data, depths):
  """Fit one ``model_cls`` tree per depth and return the depth with the lowest validation RMSE."""
  # Convert once; the row lists are identical for every depth tried.
  train_rows = x_train.values.tolist()
  val_rows = val_data.values.tolist()

  least_rmse = float('inf')
  # Returned unchanged when no depth produces an RMSE below infinity
  # (for example when every RMSE is NaN).
  best_depth = 100

  for depth in depths:
    model = model_cls(max_depth = depth)
    model.fit(train_rows)

    results = [model.classify(row) for row in val_rows]

    _, rmse_val = square_errors(val_rows, results)

    if least_rmse > rmse_val:
      least_rmse = rmse_val
      best_depth = depth

  return best_depth


def find_best_depth1(x_train, val_data = None, depths = range(2, 20)):
  """Best ``max_depth`` for DecisionTree (CART), judged by validation RMSE.

  Parameters
  ----------
  x_train : DataFrame of training rows; the last column is the target.
  val_data : DataFrame of validation rows. When omitted, the notebook-level
      ``x_val`` is used, which keeps existing one-argument calls working.
  depths : iterable of depths to try; defaults to 2 through 19.
  """
  if val_data is None:
    val_data = x_val
  return _find_best_depth(DecisionTree, x_train, val_data, depths)


def find_best_depth2(x_train, val_data = None, depths = range(2, 20)):
  """Best ``max_depth`` for C45Regressor, judged by validation RMSE.

  Parameters
  ----------
  x_train : DataFrame of training rows; the last column is the target.
  val_data : DataFrame of validation rows. When omitted, the notebook-level
      ``x_val`` is used, which keeps existing one-argument calls working.
  depths : iterable of depths to try; defaults to 2 through 19.
  """
  if val_data is None:
    val_data = x_val
  return _find_best_depth(C45Regressor, x_train, val_data, depths)

A. Predict mobility from new_cases

In [None]:
# remove_outliers("new_cases",df)
# df.dropna()

In [None]:
# train : used for training (70% of the rows)
# val   : used for validation (half of the remaining share; the depth search scores on it)
# test  : used for testing (every row left after train and val)

train_size = 0.7
val_size = (1-train_size)/2

train, val, test = train_val_test_split(df, train_size, val_size)

In [None]:
# Column order matters: the tree code and square_errors read the LAST item of
# each row as the target. Here the feature is new_cases and the target is
# retail_and_recreation.
x_train = train[['new_cases', 'retail_and_recreation']]

x_val = val[['new_cases', 'retail_and_recreation']]

x_test = test[['new_cases', 'retail_and_recreation']]

In [None]:
# CART: choose max_depth via find_best_depth1, refit on the training rows,
# then report MSE and RMSE on the test rows.

decisionTree = DecisionTree(max_depth = find_best_depth1(x_train))
decisionTree.fit(x_train.values.tolist())
predictions = [decisionTree.classify(row) for row in x_test.values.tolist()]
mse_val, rmse_val = square_errors(x_test.values.tolist(), predictions)
print(mse_val, rmse_val, sep="\n")

257.4800729335769
16.046185619441676


In [None]:
# C4.5: choose max_depth via find_best_depth2, refit on the training rows,
# then report MSE and RMSE on the test rows.

c45Regressor = C45Regressor(max_depth = find_best_depth2(x_train))
c45Regressor.fit(x_train.values.tolist())
predictions = [c45Regressor.classify(row) for row in x_test.values.tolist()]
mse_val, rmse_val = square_errors(x_test.values.tolist(), predictions)
print(mse_val, rmse_val, sep="\n")


457.51844725040195
21.38968085901241


B. Predict new_cases from mobility

In [None]:
# train : used for training (70% of the rows)
# val   : used for validation (half of the remaining share; the depth search scores on it)
# test  : used for testing (every row left after train and val)

train_size = 0.7
val_size = (1-train_size)/2

train, val, test = train_val_test_split(df, train_size, val_size)

In [None]:
# Column order matters: the tree code and square_errors read the LAST item of
# each row as the target. Here the feature is retail_and_recreation and the
# target is new_cases.
x_train = train[['retail_and_recreation', 'new_cases']]

x_val = val[['retail_and_recreation', 'new_cases']]

x_test = test[['retail_and_recreation', 'new_cases']]

In [None]:
# CART: choose max_depth via find_best_depth1, refit on the training rows,
# then report MSE and RMSE on the test rows.

decisionTree = DecisionTree(max_depth = find_best_depth1(x_train))
decisionTree.fit(x_train.values.tolist())
predictions = [decisionTree.classify(row) for row in x_test.values.tolist()]
mse_val, rmse_val = square_errors(x_test.values.tolist(), predictions)
print(mse_val, rmse_val, sep="\n")

5493578326.905828
74118.67731487003


In [None]:
# x_train.values.tolist()

In [None]:
# C4.5: choose max_depth via find_best_depth2, refit on the training rows,
# then report MSE and RMSE on the test rows.

c45Regressor = C45Regressor(max_depth = find_best_depth2(x_train))
c45Regressor.fit(x_train.values.tolist())
predictions = [c45Regressor.classify(row) for row in x_test.values.tolist()]
mse_val, rmse_val = square_errors(x_test.values.tolist(), predictions)
print(mse_val, rmse_val, sep="\n")

5421584386.186436
73631.40896510426


C. Using all mobilities to predict new cases

In [None]:
# remove_outliers("grocery_and_pharmacy",df)
# remove_outliers("residential",df)
# remove_outliers("workplaces",df)
# df.dropna()

In [None]:
# train : used for training (70% of the rows)
# val   : used for validation (half of the remaining share; the depth search scores on it)
# test  : used for testing (every row left after train and val)

train_size = 0.7
val_size = (1-train_size)/2

train, val, test = train_val_test_split(df, train_size, val_size)

In [None]:
# Column order matters: the tree code and square_errors read the LAST item of
# each row as the target. Here the six mobility columns are the features and
# new_cases is the target.
x_train = train[["retail_and_recreation","grocery_and_pharmacy","residential","transit_stations","parks","workplaces", "new_cases"]]

x_val = val[["retail_and_recreation","grocery_and_pharmacy","residential","transit_stations","parks","workplaces", "new_cases"]]

x_test = test[["retail_and_recreation","grocery_and_pharmacy","residential","transit_stations","parks","workplaces", "new_cases"]]



In [None]:
# CART: choose max_depth via find_best_depth1, refit on the training rows,
# then report MSE and RMSE on the test rows.

decisionTree = DecisionTree(max_depth = find_best_depth1(x_train))
decisionTree.fit(x_train.values.tolist())
predictions = [decisionTree.classify(row) for row in x_test.values.tolist()]
mse_val, rmse_val = square_errors(x_test.values.tolist(), predictions)
print(mse_val, rmse_val, sep="\n")

2781715177.4268527
52741.967894901805


In [None]:
# C4.5: choose max_depth via find_best_depth2, refit on the training rows,
# then report MSE and RMSE on the test rows.

c45Regressor = C45Regressor(max_depth = find_best_depth2(x_train))
c45Regressor.fit(x_train.values.tolist())
predictions = [c45Regressor.classify(row) for row in x_test.values.tolist()]
mse_val, rmse_val = square_errors(x_test.values.tolist(), predictions)
print(mse_val, rmse_val, sep="\n")

5568592658.964429
74623.00355094553
