# Twitch PostgreSQL database

### PostgreSQL connection

Based on https://naysan.ca/2020/05/31/postgresql-to-pandas/

In [1]:
# Librairies
import psycopg2
import pandas as pd
import numpy as np
import sys
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Hugo\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [2]:
# Connection parameters to login
co_param = {
    "host"      : "twitch.caampywfg0rz.us-east-1.rds.amazonaws.com",
    "database"  : "Twitch",
    "user"      : "GaTech_team_96",
    "password"  : "i-love-my-coffee-without-milk-and-sugar-at-800AM"
}

In [3]:
def connect(co_param):
    """
    Connect to the PostgreSQL database server
    """
    conn = None
    try:
        # connect to the PostgreSQL server
        print('Connecting to the PostgreSQL database...')
        conn = psycopg2.connect(**co_param)
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        sys.exit(1) 
    print("Connection successful")
    return conn

In [4]:
def postgresql_to_dataframe(conn, select_query, column_names):
    """
    Tranform a SELECT query into a pandas dataframe
    """
    cursor = conn.cursor()
    try:
        cursor.execute(select_query)
    except (Exception, psycopg2.DatabaseError) as error:
        print("Error: %s" % error)
        cursor.close()
        return 1
    
    # Naturally we get a list of tupples
    tupples = cursor.fetchall()
    cursor.close()
    
    # We just need to turn it into a pandas dataframe
    df = pd.DataFrame(tupples, columns=column_names)
    return df

### stream_data database

In [5]:
# SQL query
sql_query = """SELECT * FROM stream_data"""

# Column names
stream_data_col_names = ["game_id","stream_id","language","started_at","title",
                            "stream_type","user_id","user_name","viewer_count","user_login","game_name",
                            "thumbnail_url","tag_ids","is_mature","time_logged"]

# Retrieving the data
stream_data = postgresql_to_dataframe(connect(co_param), sql_query, stream_data_col_names)
# stream_data.head()

Connecting to the PostgreSQL database...
Connection successful


In [6]:
stream_data.shape

(2591336, 15)

###

In [7]:
# changing formatting from 
stream_data.loc[:,"started_at"] = stream_data.loc[:,"started_at"].map(lambda x: x.rstrip("Z"))
stream_data.loc[:,"started_at"] = stream_data.loc[:,"started_at"].map(lambda x: x.replace("T", " "))
stream_data["stream_duration_hours"] = pd.to_datetime(stream_data["time_logged"])-pd.to_datetime(stream_data["started_at"])
stream_data['stream_duration_hours'] = stream_data['stream_duration_hours']/np.timedelta64(1, 'h')


# Changing is_mature with True =1 & False = 0
stream_data.loc[stream_data["is_mature"] == True, "is_mature"] = 1
stream_data.loc[stream_data["is_mature"] == False, "is_mature"] = 0

stream_data.head()

# stream_data.to_csv('twitch_stream_data.csv')

Unnamed: 0,game_id,stream_id,language,started_at,title,stream_type,user_id,user_name,viewer_count,user_login,game_name,thumbnail_url,tag_ids,is_mature,time_logged,stream_duration_hours
0,66082,47439542941,en,2022-11-09 03:17:16,It’s November 8th🍁🍂 |Get In Here You Beutiful ...,live,706431485,HoneyBearForever,2,honeybearforever,Games + Demos,https://static-cdn.jtvnw.net/previews-ttv/live...,{6ea6bca4-4712-4ab9-a906-e3336a9d8039},0,2022-11-09 12:25:26,9.136111
1,66082,39833459447,ja,2022-11-09 11:37:29,INFINITAS,live,31589811,micchaq,2,micchaq,Games + Demos,https://static-cdn.jtvnw.net/previews-ttv/live...,{6ba1d230-e52f-4d81-b1e0-41f25a8a9f5d},0,2022-11-09 12:25:26,0.799167
2,66082,40052599048,de,2022-11-09 12:17:00,Auf geht es !!! über Neue Follower würde ich...,live,544022492,xxcubaliebrexx,2,xxcubaliebrexx,Games + Demos,https://static-cdn.jtvnw.net/previews-ttv/live...,"{9166ad14-41f1-4b04-a3b8-c8eb838c6be6,0c7fba97...",1,2022-11-09 12:25:26,0.140556
3,66082,41479884235,en,2022-11-09 10:03:27,Steeb got me hooked.,live,795713511,PrimalxCepterk,2,primalxcepterk,Games + Demos,https://static-cdn.jtvnw.net/previews-ttv/live...,{6ea6bca4-4712-4ab9-a906-e3336a9d8039},0,2022-11-09 12:25:26,2.366389
4,66082,39833265943,ko,2022-11-09 10:38:21,내맘대로,live,541384018,커피향가득,2,coffee_scented,Games + Demos,https://static-cdn.jtvnw.net/previews-ttv/live...,{ab2975e3-b9ca-4b1a-a93e-fb61a5d5c3a4},0,2022-11-09 12:25:26,1.784722


0. filtering at english game
1. temp => get the diff of time_logged and started_at
2. how is the game behaving after 1week or 2? >>> defining a target for the MLalgo
        input target
        avg/medium/total viewers >>> 
3. Post-temp >>> sentiment analysis >>> title
4. Audience target > is_mature of the game
5. python dict for the categories of the game



### stream_data_ENG database

In [15]:
# Generating a temp database
df_temp = stream_data

# Selecting only English stream
df_temp = df_temp[df_temp["language"] == "en"]

# Computing the final table
stream_data_ENG = df_temp
stream_data_ENG.head()

stream_data_ENG.to_csv('twitch_data_stream_data_ENG.csv')

### df_game_avg_maturity database

In [9]:
# Generating a temp database
df = stream_data_ENG[["game_id", "time_logged", "is_mature"]].copy()

# changing the format of the col
df["game_id"] = df["game_id"].astype('str')
df["avg_maturity"] = df["is_mature"].astype('float')

# Grouping by "game_id" and "time_logged" and its avg "is_mature"
df_game_avg_maturity = df.groupby(["game_id", "time_logged"]).mean()
df_game_avg_maturity.head(10)

# df_game_avg_maturity.to_csv('twitch_data_avg_maturity.csv')

Unnamed: 0_level_0,Unnamed: 1_level_0,avg_maturity
game_id,time_logged,Unnamed: 2_level_1
102007682,2022-11-09 12:24:55,0.483871
102007682,2022-11-09 12:24:57,0.483871
102007682,2022-11-09 12:25:20,0.483871
102007682,2022-11-09 12:25:26,0.483871
102007682,2022-11-09 12:25:45,0.5
102007682,2022-11-09 12:25:46,0.5
102007682,2022-11-09 12:25:47,0.5
102007682,2022-11-09 12:26:08,0.517241
102007682,2022-11-09 12:26:10,0.517241
102007682,2022-11-09 12:26:44,0.517241


### Twitch Tags and their categories

In [10]:
# Windows
# twitch_tags_cat = pd.read_csv('api_connection\Twitch_tags.csv')

# Linux/Mac
twitch_tags_cat = pd.read_csv('./api_connection/Twitch_tags.csv')

twitch_tags_cat.head()

Unnamed: 0,Category_or_Stream,TagName,TagId
0,Category,4X,7304b834-d065-47d5-9865-c19cd17d2639
1,Category,Action,4d1eaa36-f750-4862-b7e9-d0a13970d535
2,Category,Adventure Game,80427d95-bb46-42d3-bf4d-408e9bdca49a
3,Category,Arcade,7ff66192-68ef-4b69-8906-24736bf66ed0
4,Category,Autobattler,cd2ee226-342b-4e6b-90d5-c14687006b04


In [11]:
# twitch_tags_cat[['TagId', 'TagName']].to_dict()
twitch_tags_cat[twitch_tags_cat['TagId'] == '6ea6bca4-4712-4ab9-a906-e3336a9d8039']

Unnamed: 0,Category_or_Stream,TagName,TagId


### Encoding time of day into 6 blocks

In [12]:
MIN_STREAM_TIME_THRESHOLD = 0.5 # Hours
MAX_STREAM_TIME_THRESHOLD = 7 # Hours

df_with_encoded_time = stream_data_ENG
df_with_encoded_time['log_date'] = pd.to_datetime(df_with_encoded_time['time_logged']).dt.date
df_with_encoded_time['time_logged_encoded'] = pd.to_datetime(df_with_encoded_time['time_logged']).dt.hour
df_with_encoded_time['time_logged_encoded'] =  df_with_encoded_time['time_logged_encoded']//4

df_with_encoded_time = df_with_encoded_time[
    (df_with_encoded_time['stream_duration_hours'] > MIN_STREAM_TIME_THRESHOLD) & 
    (df_with_encoded_time['stream_duration_hours'] < MAX_STREAM_TIME_THRESHOLD)
].reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [13]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

si = SentimentIntensityAnalyzer()

def get_sentiment_row(row):
    sentiment = si.polarity_scores(row)
    return pd.Series([sentiment['pos'], sentiment['neg'], sentiment['neu']])

df_with_encoded_time['positive_sentiment'] = 0.0
df_with_encoded_time['negative_sentiment'] = 0.0
df_with_encoded_time['neutral_sentiment'] = 0.0

df_with_encoded_time[['positive_sentiment',
                     'negative_sentiment',
                     'neutral_sentiment']] =  df_with_encoded_time.title.apply(lambda row: get_sentiment_row(row))

### Feature Engineering

In [14]:
# Add mean, median, total viewership 
df_with_features = df_with_encoded_time.groupby(['time_logged_encoded', 'game_name', 'log_date'])[['viewer_count']].mean().reset_index()
df_with_features = df_with_features.rename(columns={'viewer_count': 'mean_viewer_count'})

temp_df = df_with_encoded_time.groupby(['time_logged_encoded', 'game_name', 'log_date'])[['viewer_count']].median().reset_index()
temp_df = temp_df.rename(columns={'viewer_count': 'median_viewer_count'})
df_with_features = pd.merge(df_with_features, temp_df)

temp_df = df_with_encoded_time.groupby(['time_logged_encoded', 'game_name', 'log_date'])[['viewer_count']].sum().reset_index()
temp_df = temp_df.rename(columns={'viewer_count': 'total_viewer_count'})
df_with_features = pd.merge(df_with_features, temp_df)


# Add mean, median, total stream time 
temp_df = df_with_encoded_time.groupby(['time_logged_encoded', 'game_name', 'log_date'])[['stream_duration_hours']].mean().reset_index()
temp_df = temp_df.rename(columns={'stream_duration_hours': 'mean_stream_duration_hours'})
df_with_features = pd.merge(df_with_features, temp_df)

temp_df = df_with_encoded_time.groupby(['time_logged_encoded', 'game_name', 'log_date'])[['stream_duration_hours']].median().reset_index()
temp_df = temp_df.rename(columns={'stream_duration_hours': 'median_stream_duration_hours'})
df_with_features = pd.merge(df_with_features, temp_df)

temp_df = df_with_encoded_time.groupby(['time_logged_encoded', 'game_name', 'log_date'])[['stream_duration_hours']].sum().reset_index()
temp_df = temp_df.rename(columns={'stream_duration_hours': 'total_stream_duration_hours'})
df_with_features = pd.merge(df_with_features, temp_df)


# Add average matrure rating 
temp_df = df_with_encoded_time.groupby(['time_logged_encoded', 'game_name', 'log_date'])[['is_mature']].mean().reset_index()
df_with_features = pd.merge(df_with_features, temp_df)


# Add mean sentiment
temp_df = df_with_encoded_time.groupby(['time_logged_encoded', 'game_name', 'log_date'])[['positive_sentiment']].mean().reset_index()
temp_df = temp_df.rename(columns={'positive_sentiment': 'mean_positive_sentiment'})
df_with_features = pd.merge(df_with_features, temp_df)

temp_df = df_with_encoded_time.groupby(['time_logged_encoded', 'game_name', 'log_date'])[['negative_sentiment']].mean().reset_index()
temp_df = temp_df.rename(columns={'negative_sentiment': 'mean_negative_sentiment'})
df_with_features = pd.merge(df_with_features, temp_df)

temp_df = df_with_encoded_time.groupby(['time_logged_encoded', 'game_name', 'log_date'])[['neutral_sentiment']].mean().reset_index()
temp_df = temp_df.rename(columns={'neutral_sentiment': 'mean_neutral_sentiment'})
df_with_features = pd.merge(df_with_features, temp_df)

df_with_features.head()

DataError: No numeric types to aggregate

In [None]:
df_with_features.to_csv('twitch_data_processed_daily.csv')

In [None]:
df_with_features

In [None]:
feauture_column_names = list(df_with_features.columns)[3:-3]
feauture_column_names

In [None]:
shift_duration_days = 3

training_data_frame_dictionary = {
    
}
for time_slot in df_with_features.time_logged_encoded.unique():
    df_filtered_on_timeslot = df_with_features[df_with_features['time_logged_encoded'] == time_slot].copy()
    
    training_data_frame_dictionary[time_slot] = {
        
    }
    
    for game in df_filtered_on_timeslot.game_name.unique():
        df_filtered_on_game = df_filtered_on_timeslot[df_filtered_on_timeslot['game_name'] == game].copy()
        df_filtered_on_game = df_filtered_on_game.sort_values(by='log_date').reset_index(drop=True)
        
        
        for col in feauture_column_names:
            df_filtered_on_game['target_'+ col + '_' + str(shift_duration_days)] = df_filtered_on_game[col].shift(-shift_duration_days)
        
        training_data_frame_dictionary[time_slot][game] = df_filtered_on_game.dropna().reset_index(drop=True)

In [None]:
"""
Dictionary with following hierarcy

-time slot
    - game name
        - DataFrame with targets 

""" 
training_data_frame_dictionary[0]['7 Days to Die']

In [None]:
import pickle

with open('saved_dictionary.pkl', 'wb') as f:
    pickle.dump(training_data_frame_dictionary, f)
        
# with open('saved_dictionary.pkl', 'rb') as f:
#     loaded_dict = pickle.load(f)


### game_info database

In [None]:
# SQL query
sql_query = """SELECT * FROM game_info"""

# Column names
game_info_col_names = ["game_id","game_name","game_picture_url","time_logged"]

# Retrieving the data
game_info = postgresql_to_dataframe(connect(co_param), sql_query, game_info_col_names)
game_info.head()

### Random Code

In [None]:
# stream_data_ENG
# ast.literal_eval(stream_data_ENG.tag_ids[0])
import re
h = re.compile("['\{][0-9A-Za-z\-]+[,\}]")

h.match(stream_data_ENG.tag_ids[50])[0][1:-1]