In [10]:
import pandas as pd

# Load the CSV file
file_path = './final_data.csv'

# Read the CSV file
df = pd.read_csv(file_path)

In [11]:
# Deleting the 'ID' and 'State_FIPS' columns
df.drop(['ID', 'State_FIPS'], axis=1, inplace=True)

# Setting 'Name' and 'YEAR' as the new primary key
df.set_index(['Name', 'State', 'YEAR'], inplace=True)

# Ensure the DataFrame is sorted by the index for correct diff calculations
df.sort_index(inplace=True)

# Calculating the difference for each metric and adding new columns for percentage change
metrics = ['FAH', 'FAFH', 'Total nominal food sales', 'Total constant dollar food sales', 
           'NUMBER_OF_EVENTS', 'DAMAGE_PROPERTY', 'DAMAGE_CROPS', 'diversity', 'tons']

for metric in metrics:
    df[f'{metric}_diff'] = df.groupby(level=0)[metric].diff()  # Difference from previous year
    df[f'{metric}_percent_change'] = df.groupby(level=0)[metric].pct_change() * 100  # Percentage change

# Removing rows with the year 2012
df = df[df.index.get_level_values('YEAR') != 2012]

# Saving the modified DataFrame to a CSV file
df.to_csv('modified_data.csv')

         Name Region_type  State  YEAR       FAH      FAFH  \
528   Abilene         MSA  TEXAS  2012  56284.10  47263.71   
1153  Abilene         MSA  TEXAS  2013  57944.13  51265.36   
1778  Abilene         MSA  TEXAS  2014  61201.71  57007.11   
2403  Abilene         MSA  TEXAS  2015  62350.08  62135.25   
432     Akron         CSA   OHIO  2012  25043.29  21068.27   

      Total nominal food sales      FAH.1     FAFH.1  \
528                  103547.81  28,315.20  24,189.32   
1153                 109209.48  28,889.19  25,688.78   
1778                 118208.81  29,801.38  27,887.53   
2403                 124485.33  30,010.40  29,551.13   
432                   46111.57  12,598.69  10,782.63   

      Total constant dollar food sales  ...         tons  diversity_year_diff  \
528                           52504.52  ...   909.464031             0.000000   
1153                          54577.97  ...   923.243490            -0.016197   
1778                          57688.91  ...   9

In [12]:
# Display basic information about the DataFrame
print("DataFrame Information:")
print(df.info())

# Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())

# Check for duplicate rows
duplicate_rows = df.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicate_rows}")

# Check for anomalies or outliers in key columns
# Adjust the columns in the list below as per your dataset
key_columns = ['diversity', 'tons', 'FAH', 'FAFH', 'Total nominal food sales', 'NUMBER_OF_EVENTS', 'DAMAGE_PROPERTY', 'DAMAGE_CROPS']

print("\nAnomalies/Outliers Check:")
for column in key_columns:
    print(f"\nColumn: {column}")
    print(f"Minimum value: {df[column].min()}")
    print(f"Maximum value: {df[column].max()}")

# Optional: Display descriptive statistics
print("\nDescriptive Statistics:")
print(df[key_columns].describe())


DataFrame Information:
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1875 entries, ('Abilene', 'TEXAS', 2013) to ('Zanesville', 'OHIO', 2015)
Data columns (total 30 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   Region_type                                      1875 non-null   object 
 1   FAH                                              1875 non-null   float64
 2   FAFH                                             1875 non-null   float64
 3   Total nominal food sales                         1875 non-null   float64
 4   FAH.1                                            1875 non-null   object 
 5   FAFH.1                                           1875 non-null   object 
 6   Total constant dollar food sales                 1875 non-null   float64
 7   NUMBER_OF_EVENTS                                 1875 non-null   int64  
 8   DAMAGE_PROPERTY                     