In [2]:
import pandas as pd
import os
import numpy as np

# Set working directory
path = "/Users/mau/Library/CloudStorage/Dropbox/Mac/Documents/Dissertation/Chapter 2/Entire_Data"
os.chdir(path)

# # Filter
filter = ['starteventdatetime', 'endeventdatetime', 'playercashableamt', 'wageredamt', 'grosswin', 'theoreticalwin', 
       'currencyinamt', 'maxbet', 'slotdenominationname', 'assetnumber', 
       'theoreticalpaybackpercent', 'playerkey', 'age', 'rank', 'gender']
# Load data into a DataFrame
dtf = pd.read_feather("compressed_data.feather", columns=filter)


In [19]:
# Print how many unique players in dataset
print(len(dtf['playerkey'].unique()))
# Count how many times does the player with the playerkey==1 appear in the dataset
print(dtf['playerkey'].value_counts()[1])

# Count how many times does the player with the playerkey==2 appear in the dataset
print(dtf['playerkey'].value_counts()[2])

# Count how many times does the player with the playerkey==3 appear in the dataset
print(dtf['playerkey'].value_counts()[3])

print(dtf.info())

# Pint gender unique values
print(dtf['gender'].unique())

# Print tail
print(dtf.tail(3))


47231
32085956
478
306
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56269928 entries, 0 to 56269927
Data columns (total 13 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   playercashableamt          float32
 1   wageredamt                 float16
 2   grosswin                   float32
 3   theoreticalwin             float16
 4   currencyinamt              float64
 5   maxbet                     float32
 6   slotdenominationname       object 
 7   assetnumber                float64
 8   theoreticalpaybackpercent  float16
 9   playerkey                  int32  
 10  age                        float16
 11  rank                       float16
 12  gender                     object 
dtypes: float16(5), float32(3), float64(2), int32(1), object(2)
memory usage: 3.0+ GB
None
['' 'M' 'F' ' ' 'U' 'NULL']
          playercashableamt  wageredamt  grosswin  theoreticalwin  \
56269925                NaN         NaN       NaN             NaN   
56269

In [18]:
# Function to reduce memory usage of a dataframe
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    # Iterate through each column of the dataframe
    for col in df.columns:
        col_type = df[col].dtype
        
        # If the column is not an object type, a category type or a datetime type
        if col_type != object and col_type.name != 'category' and 'datetime' not in col_type.name:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [27]:
# Create new dataframe that containxs only observations where age is not null and not zero
dtf_age = dtf[dtf['age'].notnull() & (dtf['age'] != 0)]

# Print how many players are in dataframe
print('Number of unique players:', len(dtf_age['playerkey'].unique()))

# Info of dataframe without reduction
print(dtf_age.info())

# # Info of dataframe with reduction 
dtf_age = reduce_mem_usage(dtf_age)

dtf_age.to_parquet("dtf_age.parquet", engine='pyarrow', compression='snappy')


Number of unique players: 46489
<class 'pandas.core.frame.DataFrame'>
Int64Index: 24116564 entries, 32085956 to 56269201
Data columns (total 13 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   playercashableamt          float32
 1   wageredamt                 float16
 2   grosswin                   float32
 3   theoreticalwin             float16
 4   currencyinamt              float64
 5   maxbet                     float32
 6   slotdenominationname       object 
 7   assetnumber                float64
 8   theoreticalpaybackpercent  float16
 9   playerkey                  int32  
 10  age                        float16
 11  rank                       float16
 12  gender                     object 
dtypes: float16(5), float32(3), float64(2), int32(1), object(2)
memory usage: 1.5+ GB
None
Memory usage of dataframe is 1517.96 MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(np.float32)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(np.float32)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(np.int32)


Memory usage after optimization is: 1563.96 MB
Decreased by -3.0%


In [28]:
# Create a new dataframe that contains only observations where gender is equal to 'M' or 'F'
dtf_gender = dtf[(dtf['gender'] == 'M') | (dtf['gender'] == 'F')]

# Print how many players are in dataframe
print('Number of unique players:', len(dtf_gender['playerkey'].unique()))

# Info of dataframe without reduction
print(dtf_gender.info())

# # Info of dataframe with reduction 
dtf_gender = reduce_mem_usage(dtf_gender)

dtf_gender.to_parquet("dtf_gender.parquet", engine='pyarrow', compression='snappy')


Number of unique players: 42319
<class 'pandas.core.frame.DataFrame'>
Int64Index: 19805971 entries, 32086434 to 56269927
Data columns (total 13 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   playercashableamt          float32
 1   wageredamt                 float16
 2   grosswin                   float32
 3   theoreticalwin             float16
 4   currencyinamt              float64
 5   maxbet                     float32
 6   slotdenominationname       object 
 7   assetnumber                float64
 8   theoreticalpaybackpercent  float16
 9   playerkey                  int32  
 10  age                        float16
 11  rank                       float16
 12  gender                     object 
dtypes: float16(5), float32(3), float64(2), int32(1), object(2)
memory usage: 1.2+ GB
None
Memory usage of dataframe is 1246.64 MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(np.float32)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(np.float32)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(np.int32)


Memory usage after optimization is: 1284.41 MB
Decreased by -3.0%


In [29]:
# # Create new dataframe that containxs only observations where age is not null and not zero
dtf_age_gender = dtf_gender[dtf_gender['age'].notnull() & (dtf_gender['age'] != 0)]

# Print how many players are in dataframe
print('Number of unique players:', len(dtf_age_gender['playerkey'].unique()))

# Info of dataframe without reduction
print(dtf_age_gender.info())

# # Info of dataframe with reduction 
dtf_age_gender = reduce_mem_usage(dtf_age_gender)

dtf_age_gender.to_parquet("dtf_age_gender.parquet", engine='pyarrow', compression='snappy')

Number of unique players: 41587
<class 'pandas.core.frame.DataFrame'>
Int64Index: 19804359 entries, 32086434 to 56269185
Data columns (total 13 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   playercashableamt          float32
 1   wageredamt                 float32
 2   grosswin                   float32
 3   theoreticalwin             float32
 4   currencyinamt              float32
 5   maxbet                     float32
 6   slotdenominationname       object 
 7   assetnumber                float32
 8   theoreticalpaybackpercent  float32
 9   playerkey                  int32  
 10  age                        float32
 11  rank                       float32
 12  gender                     object 
dtypes: float32(10), int32(1), object(2)
memory usage: 1.3+ GB
None
Memory usage of dataframe is 1284.31 MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(np.float32)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(np.float32)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(np.int32)


Memory usage after optimization is: 1284.31 MB
Decreased by 0.0%
