In [1]:
import pandas as pd
import glob
import os
from pprint import pprint

In [2]:
DATA_FOLDER = "../DATA/ANALYSIS"
def get_path(path):
    return os.path.join(DATA_FOLDER, path)

In [3]:
def avg_rating_month_data(df_reviews):
    """
    Needs two columns: 'date_year_month_hide' and 'rating_star_cleaned_hide'
    """
    df = df_reviews.copy()
    df.date_year_month_hide = pd.to_datetime(df.date_year_month_hide)
    df.sort_values("date_year_month_hide", inplace=True)
    df["sum_rating"] =  df["rating_star_cleaned_hide"].cumsum()
    df["n_rating"] =  1
    df["n_rating"] =  df["n_rating"].cumsum()
    df["avg_rating_uptodate"] = df["sum_rating"] / df["n_rating"]
    agg_func = {
        "avg_rating_uptodate": "last",
        "rating_star_cleaned_hide": "mean"
    }
    df_months = df.groupby(["date_year_month_hide"]).agg(agg_func).stack().reset_index()
    df_rating_star = df_months[df_months.level_1 == "rating_star_cleaned_hide"].set_index(
                            ["date_year_month_hide", "level_1"]
                        ).unstack(
                            fill_value=0
                        ).asfreq(
                            'MS', fill_value=0
                        ).stack().sort_index(level=1).reset_index()

    df_avg = df_months[df_months.level_1 == "avg_rating_uptodate"].set_index(
                            ["date_year_month_hide", "level_1"]
                        ).unstack(
                            fill_value=0
                        ).asfreq(
                            'MS', method="ffill"
                        ).stack().sort_index(level=1).reset_index()
    df_months = pd.concat([df_rating_star, df_avg])
    return df_months

def n_rating_month(df_reviews):
    """
    Needs two columns: 'date_year_month_hide' and 'rating_star_cleaned_hide', 'text_cleaned_hide'
    """
    df = df_reviews.copy()
    df.date_year_month_hide = pd.to_datetime(df.date_year_month_hide)
    df_n_rating = df.groupby(
            ["date_year_month_hide", "rating_star_cleaned_hide"]
        )["text_cleaned_hide"].count().reset_index()
    df_n_rating = df_n_rating.set_index(
                                ["date_year_month_hide", "rating_star_cleaned_hide"]
                            ).unstack(
                                fill_value=0
                            ).asfreq(
                                'MS', fill_value=0
                            ).stack().sort_index(level=1).reset_index()
    return df_n_rating

def write_dfs(dict_dfs, filename):

    writer = pd.ExcelWriter(filename, engine='xlsxwriter')
    for sheetname, data in dict_dfs.items():
        data.to_excel(writer, sheet_name=sheetname)
    writer.save()

    print(f"{filename} saved")
    
    
def get_analysis(df, filename=""):
    df_avg_rating = avg_rating_month_data(df)
    df_n_rating = n_rating_month(df)

    dict_dfs = {
        "Reviews": df,
        "Average Rating": df_avg_rating,
        "N Rating": df_n_rating,
    }
    
    if filename:
        write_dfs(dict_dfs, filename)
    
    return dict_dfs

In [4]:
filename = '../DATA/RAW/all_reviews.xlsx'
df = pd.read_excel(filename, engine="openpyxl", index_col=0)

In [5]:
dfs = get_analysis(df)

filename_save = get_path(f"analysis_combined.xlsx")
write_dfs(dfs, filename_save)


../DATA/ANALYSIS/analysis_combined.xlsx saved


In [6]:
filename = '../DATA/ANALYSIS/all_reviews_all_companies_recent_translated.xlsx'
df = pd.read_excel(filename, engine="openpyxl", index_col=0)

In [8]:
all_dfs = dict()

for company in df.company.unique():
    dfs = get_analysis(df[df["company"] == company])
    for d in dfs.values():
        d.loc[:, "company"] = company
    for k,v in dfs.items():
        all_dfs[k] = pd.concat([all_dfs.get(k, pd.DataFrame()), v])

filename_save = get_path(f"analysis_combined_recent.xlsx")
write_dfs(all_dfs, filename_save)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in th

../DATA/ANALYSIS/analysis_combined_recent.xlsx saved


In [9]:
filename = '../DATA/RAW/all_reviews_all_companies.xlsx'
df = pd.read_excel(filename, engine="openpyxl", index_col=0)

all_dfs = dict()

for company in df.company.unique():
    dfs = get_analysis(df[df["company"] == company])
    for d in dfs.values():
        d.loc[:, "company"] = company
    for k,v in dfs.items():
        all_dfs[k] = pd.concat([all_dfs.get(k, pd.DataFrame()), v])

filename_save = get_path(f"analysis_combined_all_companies.xlsx")
write_dfs(all_dfs, filename_save)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in th

../DATA/ANALYSIS/analysis_combined_all_companies.xlsx saved
