<a href="https://colab.research.google.com/github/oceane0815/Pink_Tax/blob/main/Google_trends_pink_tax_state_2014_to_2022.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import os

# List of raw GitHub URLs for 51 CSV files
csv_files = [
    f'https://raw.githubusercontent.com/oceane0815/Pink_Tax/main/Raw_data_51_regions_copy/file{i}.csv'
    for i in range(1, 52)
]

# Create an empty list to store processed DataFrames
all_data = []

# Loop through each CSV file URL
for url in csv_files:
    # Read the CSV file, skipping the first two rows and using the third row as header (labels)
    df = pd.read_csv(url, skiprows=2)

    # Ensure that we have at least 6 columns (adjust if your data is structured differently)
    if len(df.columns) >= 6:
        # Extract the state name from the 6th column's label (the column header)
        full_label = df.columns[5]  # The 6th column is at index 5

        # Split by ": " and extract the part after "Pink tax: "
        state_name = full_label.split(": ")[1].strip("()")  # Remove parentheses around the state name

        # Calculate the average of baseline states (CA, NY, TX, IL)
        baseline_avg = df.iloc[:, 1:5].mean(axis=1)
        ## iloc[] is the integer-location based indexer in Pandas.
        ## It allows you to select rows and columns based on their position (not labels)
        ## axis=1 means "compute the mean across columns (horizontally)" for each row.

        # Initialize an empty list to store the results
        result = []

        # Loop through each row to apply the custom logic
        for i, avg in enumerate(baseline_avg):
            sixth_col_value = df.iloc[i, 5]  # The 6th column value for the current row

            # Apply the conditions
            if avg == 0 and sixth_col_value == 0:
                result.append(0)  # Both are 0
            elif avg == 0 and sixth_col_value != 0:
                result.append('*')  # Average is 0, but sixth column is non-zero
            else:
                result.append(sixth_col_value / avg)  # Normal case

        # Add the result as a new column to the DataFrame
        df[state_name] = result

        # Append the processed DataFrame to the list, keeping only the 'Month' and the new result column
        all_data.append(df[['Month', state_name]])  # Assuming the first column is 'Month'

# Merge all DataFrames on the 'Month' column
merged_df = all_data[0]
for df in all_data[1:]:
    merged_df = pd.merge(merged_df, df, on='Month', how='outer')

# Save the merged DataFrame to a new CSV file
output_file = '/content/merged_results.csv'
merged_df.to_csv(output_file, index=False)

# Display the first few rows of the merged DataFrame
print(merged_df.head())

# Download the merged file
from google.colab import files
files.download(output_file)


     Month  Alabama  Alaska  Arizona  Arkansas  California).1  Colorado  \
0  2014-01      0.0     0.0      0.0       0.0            0.0       0.0   
1  2014-02      0.0     0.0      0.0       0.0            0.0       0.0   
2  2014-03      0.0     0.0      0.0       0.0            0.0       0.0   
3  2014-04      0.0     0.0      0.0       0.0            0.0       0.0   
4  2014-05      0.0     0.0      0.0       0.0            0.0       0.0   

   Connecticut District of Columbia  Delaware  ...  South Dakota  Tennessee  \
0          0.0                    0       0.0  ...           0.0        0.0   
1          0.0                    0       0.0  ...           0.0        0.0   
2          0.0                    0       0.0  ...           0.0        0.0   
3          0.0                    0       0.0  ...           0.0        0.0   
4          0.0                    0       0.0  ...           0.0        0.0   

  Texas).1  Utah  Vermont  Virginia  Washington  West Virginia  Wisconsin 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [4]:
import pandas as pd

# URL of the first CSV file in your GitHub repository
url = 'https://raw.githubusercontent.com/oceane0815/Pink_Tax/main/Raw_data_51_regions_copy/file1.csv'

# Read the CSV file, skipping the first two rows and using the third row as header (labels)
df = pd.read_csv(url, skiprows=2)

# Ensure that we have at least 6 columns (adjust if your data is structured differently)
if len(df.columns) >= 6:
    # Extract the state name from the 6th column's label (the column header)
    full_label = df.columns[5]  # The 6th column is at index 5

    # Split by ": " and extract the part after "Pink tax: "
    state_name = full_label.split(": ")[1].strip("()")  # Remove parentheses around the state name

    # Calculate the average of columns 2 to 5 (assuming 0-based indexing)
    avg_cols_2_to_5 = df.iloc[:, 1:5].mean(axis=1)

    # Initialize an empty list to store the results
    result = []

    # Loop through each row to apply the custom logic
    for i, avg in enumerate(avg_cols_2_to_5):
        sixth_col_value = df.iloc[i, 5]  # The 6th column value for the current row

        # Apply the conditions
        if avg == 0 and sixth_col_value == 0:
            result.append(0)  # Both are 0
        elif avg == 0 and sixth_col_value != 0:
            result.append('*')  # Average is 0, but sixth column is non-zero
        else:
            result.append(sixth_col_value / avg)  # Normal case

    # Add the result as a new column to the DataFrame
    df[state_name + '_Result'] = result

    # Display the first few rows of the DataFrame
    print(df[['Month', state_name + '_Result']].head())

# Save the modified DataFrame to a CSV file in Colab
output_file = '/content/file1_result.csv'
df[['Month', state_name + '_Result']].to_csv(output_file, index=False)

# Optional: If you want to download the file directly from Colab
from google.colab import files
files.download(output_file)



     Month  Alabama_Result
0  2014-01             0.0
1  2014-02             0.0
2  2014-03             0.0
3  2014-04             0.0
4  2014-05             0.0


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>