In [None]:
from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from datetime import date, timedelta
import time

def daterange(start_date, end_date):
    """Yield every date from start_date up to, but not including, end_date.

    Nothing is yielded when end_date is on or before start_date.
    """
    total_days = int((end_date - start_date).days)
    offset = 0
    while offset < total_days:
        yield start_date + timedelta(offset)
        offset += 1

# URL of the ratings website to scrape.
url = 'https://midrug.safenet.co.il/app/'

# First and last dates of the scraping window; daterange() excludes end_date.
start_date = date(2020, 1, 1)
end_date = date(2023, 5, 22)

# Accumulates one list per scraped table row, across all dates.
lst = []

# Column labels of the scraped tables. Stays None when no table is ever found,
# in which case the resulting DataFrame is simply empty.
columns = None

# Start a Chrome session (no explicit driver path is passed).
driver = webdriver.Chrome()

# The try/finally guarantees the browser is closed even if a lookup or the
# HTML parsing fails part-way through the date range.
try:
    driver.get(url)

    for single_date in daterange(start_date, end_date):
        # Select the crowd with value "1" in the dropdown menu.
        crowd = Select(driver.find_element(By.ID, 'Crowd'))
        crowd.select_by_value('1')

        # Replace the contents of the date field with the current date.
        # Named date_input (not `date`) so the datetime.date class imported
        # at the top of the cell is not shadowed.
        date_input = driver.find_element(By.ID, 'TheDate')
        date_input.clear()
        date_input.send_keys(single_date.strftime(r"%d%m%Y"))

        # Submit the search form.
        search = driver.find_element(By.XPATH, '//*[@id="DataPlus"]/table/tbody/tr[6]/td[2]/input')
        search.click()

        # Give the page a moment to render the results before looking them up.
        time.sleep(0.5)
        try:
            table = driver.find_element(By.ID, "Rep2")
        except NoSuchElementException:
            # No ratings table for this date; move on to the next one.
            continue

        # Parse the table's HTML and collect its rows.
        data = pd.read_html(table.get_attribute("outerHTML"), encoding='windows-1255')[0]
        if columns is None:
            columns = data.columns
        lst.extend(data.values.tolist())
finally:
    driver.quit()

# Build a DataFrame from the collected rows.
df = pd.DataFrame(lst, columns=columns)

# Write the DataFrame to a CSV file (the index is written as the first column).
df.to_csv('text.csv', encoding='windows-1255')

# Show the first five rows.
print(df.head())

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 

# Read the scraped ratings from the CSV file.
df = pd.read_csv('text.csv', encoding='windows-1255')
df.head()

# Combine the date and original-broadcast-time text columns into one string column.
df['Datetime'] = df['תאריך'] + ' ' + df['שעת השידור המקורית']
df.head()

# Parse the combined text as day/month/year hour:minute, then drop the two source columns.
df['Datetime'] = pd.to_datetime(df['Datetime'], format='%d/%m/%Y %H:%M')
df = df.drop(['תאריך', 'שעת השידור המקורית'], axis=1)
df.head()

# Rename the columns to English names.
df = df.rename(columns={'Unnamed: 0': 'ID',
                 'דירוג': 'Ranking',
                 'שם התוכנית': 'TV_Show',
                 'ערוץ/משדר': 'Channel',
                 'משך בדקות': 'Duration',
                 'שיעורי צפייה במשקי בית (%)': 'Ranking_Percent',
                 'אלפי משקי בית': 'Households'})
df.head()

# Extract calendar features from the Datetime column.
df['Month'] = df['Datetime'].dt.month
# Series.dt.week was removed in pandas 2.0; isocalendar().week is its
# replacement. The cast turns the UInt32 result back into plain int64.
df['Week'] = df['Datetime'].dt.isocalendar().week.astype('int64')
df['Day'] = df['Datetime'].dt.day
df['Hour'] = df['Datetime'].dt.hour
df['Minute'] = df['Datetime'].dt.minute
df['Day_of_week'] = df['Datetime'].dt.day_of_week
df.head()

# Scale Households by 1000 (the source column header is in thousands of households).
df['Households'] = df['Households'] * 1000
df.head()

df.describe()

# Author's note: there are no nulls in the dataset.
df.info()

# How many distinct TV shows are in the dataset?
df['TV_Show'].drop_duplicates().shape[0]

# How many distinct channels?
df['Channel'].drop_duplicates().shape[0]


# List the channels present in the dataset.
print(df['Channel'].unique())

# Number of rows (broadcasts) per channel.
counts = df['Channel'].value_counts()

# Bar chart of rows per channel.
# Author's observation: Keshet 12 and Reshet 13 dominate so heavily that the
# other channels are insignificant in this dataset.
fig, ax = plt.subplots()
ax.bar(counts.index, counts.values)
ax.set(xlabel='Channel', ylabel='count')
plt.show()

# Keep only Keshet 12 and Reshet 13: drop every row whose channel is neither of them.
main_channels = ['קשת 12', 'רשת 13']
is_other_channel = ~df['Channel'].isin(main_channels)
df = df.drop(df.loc[is_other_channel].index)

# Author's note: this dropped 1,180 rows.
df.info()

# Check which channels remain.
df['Channel'].unique()

# Remove TV shows with fewer than 5 rows, because they can skew the results.
episodes_per_show = df['TV_Show'].value_counts()
rare_shows = episodes_per_show.index[(episodes_per_show < 5).to_numpy()]
keep_mask = ~df['TV_Show'].isin(rare_shows)
df = df[keep_mask]

# Author's note: this dropped 1,878 rows.
df.info()

# Check that every remaining show has at least 5 rows.
df['TV_Show'].value_counts()

# Correlation matrix of the numeric columns.
# The frame still contains text columns (TV_Show, Channel) and a datetime
# column at this point; since pandas 2.0 DataFrame.corr() no longer skips
# non-numeric columns silently, so they are excluded explicitly.
# Author's observation: Households and Ranking_Percent are basically the same thing.
corr = df.select_dtypes(include='number').corr()

# Plot the correlation matrix.
plt.figure(figsize=(10, 10))
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.show()

# Drop ID and Households; Ranking_Percent is the variable we want to predict.
df = df.drop(['ID', 'Households'], axis=1)
df.head()

# Mean Ranking_Percent per hour of the day.
# Author's observation: the highest rating is at 21:00.
group_hr = df.groupby('Hour')['Ranking_Percent'].mean()

# Fix the x axis to the 24 hours of the day on the current axes, then draw on those axes.
hour_ax = plt.gca()
hour_ax.set_xticks(range(24))
hour_ax.set_xlim(0, 24)
group_hr.plot(kind='line', figsize=(10, 10), ax=hour_ax)
hour_ax.set_xlabel('Hour')
hour_ax.set_ylabel('Percentage')
hour_ax.set_title('Percentage variable over hours')
plt.show()

# Mean Ranking_Percent per day of the month.
# Author's observation: the lowest rating is on the 10th of every month - maybe because of salaries?
group_day = df.groupby('Day')['Ranking_Percent'].mean()

day_ax = group_day.plot(kind='line', figsize=(10, 10))
day_ax.set_xlabel('Day')
day_ax.set_ylabel('Percentage')
day_ax.set_title('Percentage variable over days')
plt.show()

# Mean Ranking_Percent per month of the year.
# Author's observation: lowest in September, highest in March - winter versus summer, maybe?
group_month = df.groupby('Month')['Ranking_Percent'].mean()

month_ax = group_month.plot(kind='line', figsize=(10, 10))
month_ax.set_xlabel('Month')
month_ax.set_ylabel('Percentage')
month_ax.set_title('Percentage variable over months')
plt.show()

# Mean Ranking_Percent per day of the week.
# Author's observation: lowest on Wednesday, highest on Monday.
group_dow = df.groupby('Day_of_week')['Ranking_Percent'].mean()

dow_ax = group_dow.plot(kind='line', figsize=(10, 10))
dow_ax.set_xlabel('Day of week')
dow_ax.set_ylabel('Percentage')
dow_ax.set_title('Percentage variable over the days of the week')
plt.show()

# Mean Ranking_Percent by show duration.
# Author's observation: spikes around 80 minutes, 100 minutes and 170-180 minutes.
grouped_df = df.groupby('Duration')

# Mean of Ranking_Percent for each duration.
mean_rank_df = grouped_df['Ranking_Percent'].mean()

# Line plot of the mean rating per duration.
mean_rank_df.plot(kind='line', figsize=(10, 5))

# The x axis holds the Duration groups (it was previously mislabelled 'TV_Show').
plt.xlabel('Duration')
plt.ylabel('Ranking_Percent')

# Show the plot.
plt.show()

# The 20 shows with the highest mean Ranking_Percent.
# Author's observation: first is Hazamar Bamasechah, second is Married at First Sight, etc.;
# mostly reality shows.
grouped_df = df.groupby('TV_Show')

# Mean of Ranking_Percent for each show, keeping the 20 largest.
mean_rank_df = grouped_df['Ranking_Percent'].mean().nlargest(20)

# Bar plot of the top 20 shows.
mean_rank_df.plot(kind='bar', figsize=(10, 5))

# Add labels to the axes.
plt.xlabel('TV_Show')
plt.ylabel('Ranking_Percent')

# Show the plot.
plt.show()

# Mean Ranking_Percent per channel.
# Author's observation: Keshet has a higher rating than Reshet.
grouped_df = df.groupby('Channel')

# Mean of Ranking_Percent for each channel.
mean_rank_df = grouped_df['Ranking_Percent'].mean()

# Bar plot of the mean rating per channel.
mean_rank_df.plot(kind='bar', figsize=(10, 5))

# Add labels to the axes.
plt.xlabel('Channel')
plt.ylabel('Ranking_Percent')

# Show the plot.
plt.show()

# Save the processed dataset (the index is written as the first column).
df.to_csv('processed.csv', encoding='windows-1255')



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
import shap
from sklearn.preprocessing import LabelEncoder

# Read the processed dataset.
df = pd.read_csv('processed.csv', encoding='windows-1255')
df.head()

# Drop columns that are not used for training: the unnamed index column picked
# up from the CSV and the raw Datetime column.
df = df.drop(['Unnamed: 0', 'Datetime'], axis=1)
df.head()

# Label-encode the object (text) columns and keep each fitted encoder so the
# integer labels can be mapped back to names later.
# Author's note: the model can handle categorical variables itself, but the
# encoding is needed for the SHAP algorithm.
encoders = {}

for column in df.select_dtypes(include=[object]):
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    encoders[column] = le

# Inspect the TV show names known to the encoder. This runs after the loop so
# it cannot raise KeyError when 'TV_Show' is not the first column encoded.
encoders['TV_Show'].classes_

# Separate the dependent variable from the independent variables.
y = df['Ranking_Percent']
X = df.drop(['Ranking_Percent'], axis=1)

# Split into train and test sets, stratified on TV_Show so every show is
# represented proportionally in both sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=X['TV_Show'], random_state=42)

# Train a baseline model with default hyperparameters.
# NOTE(review): after label encoding the categorical columns are integers, so
# categorical_feature='auto' presumably treats them as numeric - confirm this is intended.
model = lgb.LGBMRegressor()
model.fit(X_train, y_train, categorical_feature='auto')

# Get the baseline predictions.
y_pred = model.predict(X_test)

# Evaluation metrics.
# RMSE is the square root of the MSE; the `squared=False` shortcut is not used
# because it has been removed from recent scikit-learn releases.
rmse = mean_squared_error(y_test, y_pred) ** 0.5  # Root mean squared error.
mae = mean_absolute_error(y_test, y_pred)  # Mean absolute error.
r2 = r2_score(y_test, y_pred)  # R2 score.

# Print the metrics explicitly; bare expressions are only displayed when they
# are the last statement of a notebook cell.
print(f'Baseline RMSE: {rmse}\n MAE: {mae}\n R2: {r2}')

# Search spaces for Bayesian Search.
search_spaces = {
    'boosting_type': Categorical(['gbdt', 'dart']),
    'num_leaves': Integer(20, 200),
    'learning_rate': Real(0.01, 0.2, 'log-uniform'),
    'n_estimators': Integer(50, 1000),
    'max_depth': Integer(5, 50),
    'min_data_in_leaf': Integer(10, 300),
    'max_bin': Integer(100, 400),
    'feature_fraction': Real(0.6, 1.0, 'uniform'),
    'bagging_fraction': Real(0.6, 1.0, 'uniform'),
    'bagging_freq': Integer(0, 20),
    'min_sum_hessian_in_leaf': Real(0, 10),
    'lambda_l1': Real(1e-10, 100, 'log-uniform'),
    'lambda_l2': Real(1e-10, 100, 'log-uniform'),
    'min_gain_to_split': Real(0, 0.7),
}

# Hyperparameter optimization.
lgbm = lgb.LGBMRegressor(objective='regression', metric='mae')

# random_state makes the search repeatable, matching the seeded train/test split above.
opt = BayesSearchCV(
    lgbm,
    search_spaces,
    n_iter=50,
    cv=5,
    n_jobs=-1,
    random_state=42,
)

opt.fit(X_train, y_train, categorical_feature='auto')

print("Best parameters found: ", opt.best_params_)


# Train a regressor with the best hyperparameters found by the search.
best = lgb.LGBMRegressor(**opt.best_params_)
best.fit(X_train, y_train)

def get_scores(y_true, y_pred):
    """Print the RMSE, MAE and R2 score of predictions against true values.

    Args:
        y_true: The ground-truth target values.
        y_pred: The predicted target values, aligned with y_true.
    """
    # RMSE is computed as the square root of the MSE; the `squared=False`
    # shortcut has been removed from recent scikit-learn releases. The two
    # are equivalent for a single (1-D) target.
    rmse = mean_squared_error(y_true, y_pred) ** 0.5
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f'RMSE: {rmse}\n MAE: {mae}\n R2: {r2}')

# Training set evaluation.
y_pred_train = best.predict(X_train)
get_scores(y_train, y_pred_train)

# Test set evaluation.
y_pred_test = best.predict(X_test)
get_scores(y_test, y_pred_test)

# Create a SHAP tree explainer instance, because LGBM is a tree-based algorithm.
explainer = shap.TreeExplainer(best)

# Calculate SHAP values for the test set.
shap_values = explainer.shap_values(X_test)

# Plot the SHAP values for the first test instance.
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[0], X_test.iloc[0])

# Encoded TV show label of the first test instance.
first_show_label = X_test['TV_Show'].iloc[0]

# Map that label back to the show name. The label is looked up rather than
# hard-coded, so this stays correct if the data or the split changes.
first_show_name = encoders['TV_Show'].classes_[first_show_label]
print(first_show_name)

# A SHAP summary plot showing the impact of every feature on the outcome.
# The test features themselves are passed (not only the column labels), so
# the plot has the feature values as well as the feature names.
shap.summary_plot(shap_values, X_test)

































