In [12]:
import numpy as np 
import pandas as pd 
import yfinance as yf
import os
import datetime
import plotly.express as px

In [34]:
AMZN = yf.download(tickers = 'AMZN', period = '1Y', interval = '1d')
AMZN

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-12-15,168.598007,173.600006,165.195007,173.315002,173.315002,75794000
2021-12-16,173.368500,174.166000,168.160507,168.871002,168.871002,60876000
2021-12-17,167.710495,170.898499,165.613495,170.017502,170.017502,85542000
2021-12-20,166.850006,167.874496,165.600006,167.078995,167.078995,57372000
2021-12-21,167.850494,170.716507,165.647507,170.417007,170.417007,55956000
...,...,...,...,...,...,...
2022-12-08,89.239998,90.860001,87.879997,90.349998,90.349998,73305900
2022-12-09,88.900002,90.300003,88.629997,89.089996,89.089996,67316900
2022-12-12,89.209999,90.580002,87.870003,90.550003,90.550003,61999800
2022-12-13,95.230003,96.250000,90.519997,92.489998,92.489998,100212000


In [15]:
def get_ratings_data(directory):
    ratings = {}
    col_names = ['Item ID', 'User ID', 'Rating', 'Timestamp']
    
    # Iterate through the data in the given directory
    for subdir, dirs, files in os.walk(directory):

        # For each file, get the filename and extract the DF
        for file in files:
            filename = str(os.path.join(subdir, file))
            # Create a DF of the data and add it to the dict
            df = pd.read_csv(filename, names = col_names)

            # Convert Timestamp into dates
            times = df['Timestamp']
            for i in range(len(times)):
                times[i] = datetime.datetime.fromtimestamp(times[i]).date()
            df = df.drop(labels = 'Timestamp', axis = 1)
            df = pd.concat([df,times], axis = 1)

            # Sort the dataframe by the dates
            df = df.sort_values('Timestamp')
            ratings[get_filename(filename)] = df
           
    # Return the dict of complete ratings from each file
    return ratings

def get_filename(file):
    
    # Remove any reference of directory structures in the file
    filename = file.split('/')[-1]
    
    # Remove the extension
    filename = filename.split('.')[0]
    
    # If I'm using a 5-core data file, remove the _5 that denotes that it is a 5-core
    if '_5' in filename:
        filename = filename.split('_5')[0]
                
    return filename

In [18]:
# Get the average rating across all data
def get_average_ratings(directory):

    # Get the ratings data and get the keys (filenames)
    ratings = get_ratings_data(directory)
    ratings_keys = list(ratings.keys())

    # Make a dict for average ratings
    all_average_ratings = {}
    
    # For each file
    for key in ratings_keys:

        file = get_filename(key)
        
        # Get the dataframe associated with each category
        df = ratings[file]
        # Get the number of unique dates
        dates = df['Timestamp'].unique()

        # Prepare to calculate average ratings
        average_ratings = {}
        average_ratings["Date"] = []
        average_ratings["Average Rating"] = []
        
        # For each unique date
        for date in dates:

            # Get the ratings for that date and take their mean
            values = df.loc[df['Timestamp'] == date]
            average_rating = values['Rating'].mean()

            # Add the date and the average rating to the proper dictionary index
            average_ratings['Date'].append(date)
            average_ratings['Average Rating'].append(average_rating)
        
        # For that file, add the dicts to the average rating list
        all_average_ratings[file] = average_ratings
     
    # Convert each dict to a dataframe for later use/graphing
    for category in all_average_ratings:
        df = pd.DataFrame(data = all_average_ratings[category]['Average Rating'], columns = ['Average Rating'], index = all_average_ratings[category]['Date'])     
        all_average_ratings[category] = df

    # Return all average ratings
    return all_average_ratings

In [19]:
rating_dir = '../data/ratings'

average_ratings = get_average_ratings(rating_dir)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  times[i] = datetime.datetime.fromtimestamp(times[i]).date()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  times[i] = datetime.datetime.fromtimestamp(times[i]).date()


In [35]:
AMZN = AMZN.drop(labels = 'Volume', axis=1)
fig = px.line(AMZN)

fig.show()

In [38]:
for category in average_ratings:
    fig = px.line(average_ratings[category])
    fig.show()