# 1. Loading original data

In [1]:
import pandas as pd

# Read the raw survey responses; the path is relative to the notebook's
# working directory.
raw_csv_path = '../OriginalData/earthquake_data.csv'
df = pd.read_csv(raw_csv_path)

# List the original column labels (the full question texts) for reference.
print(df.columns)

Index(['In general, how worried are you about earthquakes?',
       'How worried are you about the Big One, a massive, catastrophic earthquake?',
       'Do you think the "Big One" will occur in your lifetime?',
       'Have you ever experienced an earthquake?',
       'Have you or anyone in your household taken any precautions for an earthquake (packed an earthquake survival kit, prepared an evacuation plan, etc.)?',
       'How familiar are you with the San Andreas Fault line?',
       'How familiar are you with the Yellowstone Supervolcano?', 'Age',
       'What is your gender?',
       'How much total combined money did all members of your HOUSEHOLD earn last year?',
       'US Region'],
      dtype='object')


# 2. Shorten columns' names

Because the original column names are probably the exact text of the questions in the questionnaire, they are impractical to use in further analysis and need to be shortened.

In [69]:
# Short, analysis-friendly labels keyed by the original question text.
# 'Age' and 'US Region' are not listed here, so they keep their names.
short_column_names = {
    "In general, how worried are you about earthquakes?": "General fear level",
    "How worried are you about the Big One, a massive, catastrophic earthquake?": "Big One fear level",
    'Do you think the "Big One" will occur in your lifetime?': "Big One expected",
    "Have you ever experienced an earthquake?": "Earthquake experience",
    "Have you or anyone in your household taken any precautions for an earthquake (packed an earthquake survival kit, prepared an evacuation plan, etc.)?": "Prepered",
    "How familiar are you with the San Andreas Fault line?": "Familiarity with SAF",
    "How familiar are you with the Yellowstone Supervolcano?": "Familiarity with YS",
    "What is your gender?": "Gender",
    "How much total combined money did all members of your HOUSEHOLD earn last year?": "Earnings",
}

# Relabel the columns on the existing frame.
df.rename(columns=short_column_names, inplace=True)

# Summarise every column (count / unique / top / freq) to sanity-check the rename.
df.describe(include='all')

Unnamed: 0,General fear level,Big One fear level,Big One expected,Earthquake experience,Prepered,Familiarity with SAF,Familiarity with YS,Age,Gender,Earnings,US Region
count,1013,1013,1013,1006,1006,1001,1001,1001,1001,1001,978
unique,5,5,2,3,2,5,5,4,2,11,9
top,Not at all worried,Not at all worried,No,"Yes, one or more minor ones",No,Somewhat familiar,Somewhat familiar,45 - 59,Female,"$50,000 to $74,999",Pacific
freq,356,334,577,483,788,397,280,275,521,175,206


# 3. Check unique values in all columns 

In [70]:
# Walk the frame column by column and print the distinct values of each one
# (NaN included), so the answer categories can be inspected before encoding.
for column_name, column_values in df.items():
    print(f'\n{column_name}: {column_values.unique()}')


General fear level: ['Not at all worried' 'Somewhat worried' 'Not so worried' 'Very worried'
 'Extremely worried']

Big One fear level: ['Not so worried' 'Very worried' 'Somewhat worried' 'Not at all worried'
 'Extremely worried']

Big One expected: ['No' 'Yes']

Earthquake experience: ['Yes, one or more minor ones' 'Yes, one or more major ones' 'No' nan]

Prepered: ['No' 'Yes' nan]

Familiarity with SAF: ['Somewhat familiar' 'Not at all familiar' 'Very familiar' nan
 'Not so familiar' 'Extremely familiar']

Familiarity with YS: ['Not so familiar' 'Not at all familiar' 'Somewhat familiar'
 'Extremely familiar' nan 'Very familiar']

Age: ['18 - 29' nan '30 - 44' '45 - 59' '60']

Gender: ['Male' nan 'Female']

Earnings: ['Prefer not to answer' '$75,000 to $99,999' '$10,000 to $24,999'
 '$25,000 to $49,999' '$200,000 and up' nan '$50,000 to $74,999'
 '$0 to $9,999' '$100,000 to $124,999' '$175,000 to $199,999'
 '$125,000 to $149,999' '$150,000 to $174,999']

US Region: ['New England' 'Ea

# 4. Map respondents' answers to numbers for further analysis

For easier statistical analysis, string answers are mapped to numerical ones.

In [71]:

# Ordinal answer scales, listed from the lowest to the highest level; the
# position of a label in its list becomes its numeric code (0, 1, 2, ...).
worry_scale = ['Not at all worried', 'Not so worried', 'Somewhat worried',
               'Very worried', 'Extremely worried']
experience_scale = ['No', 'Yes, one or more minor ones', 'Yes, one or more major ones']
familiarity_scale = ['Not at all familiar', 'Not so familiar', 'Somewhat familiar',
                     'Very familiar', 'Extremely familiar']

mapping_GeneralFearLevel = {label: code for code, label in enumerate(worry_scale)}

# Both fear questions use the same worry scale, so the mapping is shared.
mapping_BigOneFearLevel = mapping_GeneralFearLevel

mapping_EarthquakeExperince = {label: code for code, label in enumerate(experience_scale)}

mapping_FamiliarityWithSAF = {label: code for code, label in enumerate(familiarity_scale)}

# Both familiarity questions use the same familiarity scale as well.
mapping_FamiliarityWithYS = mapping_FamiliarityWithSAF

# Per-column replacement spec: {column label: {answer text: numeric code}}.
# Values that are not keys of a column's mapping (e.g. NaN) are left as they are.
codes_by_column = {
    'General fear level': mapping_GeneralFearLevel,
    'Big One fear level': mapping_BigOneFearLevel,
    'Earthquake experience': mapping_EarthquakeExperince,
    'Familiarity with SAF': mapping_FamiliarityWithSAF,
    'Familiarity with YS': mapping_FamiliarityWithYS,
}
df.replace(to_replace=codes_by_column, inplace=True)

df.head()

Unnamed: 0,General fear level,Big One fear level,Big One expected,Earthquake experience,Prepered,Familiarity with SAF,Familiarity with YS,Age,Gender,Earnings,US Region
0,0,1,No,1.0,No,2.0,1.0,18 - 29,Male,Prefer not to answer,New England
1,2,3,No,1.0,No,0.0,0.0,18 - 29,Male,"$75,000 to $99,999",East North Central
2,1,2,No,1.0,No,3.0,2.0,18 - 29,Male,"$10,000 to $24,999",Pacific
3,1,1,No,1.0,No,3.0,1.0,18 - 29,Male,"$25,000 to $49,999",West South Central
4,1,1,Yes,1.0,No,2.0,4.0,18 - 29,Male,"$200,000 and up",Middle Atlantic


# 5. Drop all NaN values

Because only a small percentage of the values in the original data are NaN, the rows containing them can simply be dropped.

In [72]:
# Remove every row that has at least one missing value (these are dropna's
# defaults, spelled out), then check the remaining row count and dtypes.
df.dropna(axis='index', how='any', inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 978 entries, 0 to 1011
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   General fear level     978 non-null    int64  
 1   Big One fear level     978 non-null    int64  
 2   Big One expected       978 non-null    object 
 3   Earthquake experience  978 non-null    float64
 4   Prepered               978 non-null    object 
 5   Familiarity with SAF   978 non-null    float64
 6   Familiarity with YS    978 non-null    float64
 7   Age                    978 non-null    object 
 8   Gender                 978 non-null    object 
 9   Earnings               978 non-null    object 
 10  US Region              978 non-null    object 
dtypes: float64(3), int64(2), object(6)
memory usage: 91.7+ KB


# 6. Split df to follow the "tidy data" rules - each type of observational unit forms its own table

There are two types of information stored in df: the respondents' answers and demographic data about the respondents. To obey the "tidy data" rules, df should be split into two dataframes - answers and demographics.

In [73]:
# Columns describing who the respondent is.
demographic_columns = ['Age', 'Gender', 'Earnings', 'US Region']

# Columns holding the respondent's answers to the earthquake questions.
answer_columns = [
    'General fear level',
    'Big One fear level',
    'Big One expected',
    'Earthquake experience',
    'Prepered',
    'Familiarity with SAF',
    'Familiarity with YS',
]

# Both tables are selected from the same frame, so they share its row index.
demographics = df[demographic_columns]
answers = df[answer_columns]

# 7. Save results to .csv files

In [74]:
# Write each table to its own CSV file. The row index is written as the first
# column (to_csv default), which keeps rows of the two files matchable.
for table, csv_path in (
    (answers, '../AnalysisData/answers.csv'),
    (demographics, '../AnalysisData/demographics.csv'),
):
    table.to_csv(csv_path)