# Working with JSON in Python
JSON (JavaScript Object Notation) is a lightweight data-interchange format. This notebook demonstrates how to parse, manipulate, and analyze JSON data using Python.

#  What is JSON?

In [3]:
import pandas as pd
from pandas import json_normalize
import json
import datetime


In [5]:


# A JSON object has the same shape as a Python dictionary: nested objects
# become nested dicts and JSON arrays become lists.
STUDENT = {
    "name": "Jim",
    "age": 20,
    "skills": ["Python", "SQL", "Data Analysis"],
    "location": {"city": "Nairobi", "country": "Kenya"},
}

# Values are looked up by key, exactly as with any other dictionary.
STUDENT["age"]

20

### Exploring a JSON Schema

In [6]:


# JSON held as plain text, as it would arrive from a file or an HTTP response
json_data = """
{
  "name": "Jim",
  "age": 20,
  "skills": ["Python", "SQL", "Data Analysis"],
  "location": {
    "city": "Nairobi",
    "country": "Kenya"
  }
}

"""

# json.loads ("load string") decodes the text into native Python objects
data = json.loads(json_data)

# A top-level JSON object decodes to a dict
print(type(data))

data["name"]


<class 'dict'>


'Jim'

### Reading JSON from a File

In [7]:

# Read from a JSON file.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default text encoding (which is not UTF-8 everywhere).
with open('../data/user_data.json', encoding='utf-8') as file:
    # json.load decodes straight from the open file object
    data = json.load(file)

data


{'name': 'Jim',
 'age': 20,
 'skills': ['Python', 'SQL', 'Data Analysis'],
 'location': {'city': 'Nairobi', 'country': 'Kenya'}}

In [8]:
# The decoded object is indexed by key like any other dictionary.
data["name"]


'Jim'

### Flattening JSON (Unpacking Nested Structures)

Flattening unpacks nested structures into a flat, table-like layout (one column per nested field), which makes the data easier to analyze.

In [9]:


# One record containing a nested object ("location") and a list ("skills")
data = {
    "name": "Jim",
    "age": 20,
    "skills": ["Python", "SQL", "Data Analysis"],
    "location": {"city": "Nairobi", "country": "Kenya"},
}

# Nested dict keys become dot-separated column names; the list stays in a single cell
df = pd.json_normalize(data)
df


Unnamed: 0,name,age,skills,location.city,location.country
0,Jim,20,"[Python, SQL, Data Analysis]",Nairobi,Kenya


### Using JSON with Pandas

Pandas provides tools for working with JSON data, such as converting it into a DataFrame for analysis.

In [10]:
# A JSON array of objects: each element becomes one DataFrame row
json_data = """
[
    {
        "name": "Alice",
        "age": 28,
        "location": {"city": "Kampala", "country": "Uganda"}
    },
    {
        "name": "Brian",
        "age": 31,
        "location": {"city": "Nairobi", "country": "Kenya"}
    }
]
"""

# Decode the text into a list of dicts, then flatten the nested "location" objects
data = json.loads(json_data)
df = pd.json_normalize(data)
df


Unnamed: 0,name,age,location.city,location.country
0,Alice,28,Kampala,Uganda
1,Brian,31,Nairobi,Kenya


###  Transforming JSON into DataFrames

In [11]:
# Illustrative payload: a top-level object whose "users" key holds a list of records.
{
  "users": [
    {"id": 1, "name": "Alice"},
    {"id": 2, "name": "Bob"}
  ]
}


{'users': [{'id': 1, 'name': 'Alice'}, {'id': 2, 'name': 'Bob'}]}

In [12]:
# The records to tabulate sit under the top-level "users" key
json_data = """
{
    "users": [
        {"id": 1, "name": "Alice"},
        {"id": 2, "name": "Bob"}
    ]
}
"""

data = json.loads(json_data)

# A list of flat dicts converts directly: one row per dict, one column per key
user_records = data["users"]
df = pd.DataFrame(user_records)
df


Unnamed: 0,id,name
0,1,Alice
1,2,Bob


### Real-World Example: API Data

Imagine you call a weather API that returns data like this:

In [13]:
# Illustrative response: a location name plus a list of per-day forecast records.
{
  "location": "Nairobi",
  "forecast": [
    {"day": "Monday", "temp": 25},
    {"day": "Tuesday", "temp": 26}
  ]
}


{'location': 'Nairobi',
 'forecast': [{'day': 'Monday', 'temp': 25}, {'day': 'Tuesday', 'temp': 26}]}

Turn this into a DataFrame.

In [14]:
# Already-decoded API response: one location and a list of daily readings
weather_json = {
    "location": "Nairobi",
    "forecast": [
        {"day": "Monday", "temp": 25},
        {"day": "Tuesday", "temp": 30},
        {"day": "Wednesday", "temp": 22},
        {"day": "Thursday", "temp": 32},
        {"day": "Friday", "temp": 50},
    ],
}

# Only the "forecast" list is tabular; each entry becomes one row
forecast_rows = weather_json["forecast"]
df = pd.DataFrame(forecast_rows)
df


Unnamed: 0,day,temp
0,Monday,25
1,Tuesday,30
2,Wednesday,22
3,Thursday,32
4,Friday,50


#### Or load the forecast from a JSON file

In [15]:

# Read from a JSON file.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default text encoding (which is not UTF-8 everywhere).
with open('../data/weather_data.json', encoding='utf-8') as file:
    data = json.load(file)

# Tabulate the list stored under the "forecast" key, one row per entry
df = pd.DataFrame(data["forecast"])

df

Unnamed: 0,day,temp
0,Monday,25
1,Tuesday,30
2,Wednesday,22
3,Thursday,32
4,Friday,50


### Data Cleaning with JSON + Pandas

In [17]:

# Raw, already-decoded response
weather_json = {
    "location": "Nairobi",
    "forecast": [
        {"day": "Monday", "temp": 25},
        {"day": "Tuesday", "temp": 30},
        {"day": "Wednesday", "temp": 22},
        {"day": "Thursday", "temp": 32},
        {"day": "Friday", "temp": 50},  # Possibly suspicious
    ],
}

# Tabulate the forecast list and repeat the top-level location on every row
df = pd.DataFrame(weather_json["forecast"]).assign(location=weather_json["location"])
df


Unnamed: 0,day,temp,location
0,Monday,25,Nairobi
1,Tuesday,30,Nairobi
2,Wednesday,22,Nairobi
3,Thursday,32,Nairobi
4,Friday,50,Nairobi


## Handling Missing Values

In [18]:
# Inject a missing temperature.
# An integer column cannot hold a missing value, so the column is cast to
# float explicitly before the gap is written; relying on the assignment to
# upcast the column implicitly is deprecated in recent pandas releases.
df["temp"] = df["temp"].astype("float64")
df.loc[2, "temp"] = float("nan")
df

Unnamed: 0,day,temp,location
0,Monday,25.0,Nairobi
1,Tuesday,30.0,Nairobi
2,Wednesday,,Nairobi
3,Thursday,32.0,Nairobi
4,Friday,50.0,Nairobi


### Detect missing values:

In [19]:

# Number of missing values in each column
df.isna().sum()


day         0
temp        1
location    0
dtype: int64

#### Fill with average (recommended):

In [20]:

# Impute the gap with the mean of the observed temperatures
# (the mean is computed over the non-missing values only)
mean_temp = df["temp"].mean()
df["temp"] = df["temp"].fillna(mean_temp)

df


Unnamed: 0,day,temp,location
0,Monday,25.0,Nairobi
1,Tuesday,30.0,Nairobi
2,Wednesday,34.25,Nairobi
3,Thursday,32.0,Nairobi
4,Friday,50.0,Nairobi


#### Drop rows with missing values (only when the missing field is critical; generally not recommended):

In [31]:
# Blank out row 2's temperature again so there is a missing value to drop
df.loc[2, "temp"] = None

# Keep only the rows in which every column has a value; `df` itself is left as is
df_cleaned = df.dropna(axis="index", how="any")

df_cleaned

Unnamed: 0,day,temp,location
0,Monday,25.0,Nairobi
1,Tuesday,30.0,Nairobi
3,Thursday,32.0,Nairobi


## Detecting and Handling Outliers

Check basic stats:

In [47]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
# NOTE(review): at this point `df` still contains Friday's 50 and a missing
# value on row 2, so a fresh run would report max 50; the recorded output
# (max 32.0) appears to come from a different kernel state. Confirm with
# Restart & Run All.
df.describe()

Unnamed: 0,temp
count,4.0
mean,27.25
std,4.573474
min,22.0
25%,24.25
50%,27.5
75%,30.5
max,32.0


Flag or filter outliers:

In [53]:
# Rebuild the frame from the raw forecast so this step starts from unmodified data
df = pd.DataFrame(weather_json["forecast"])

# Any temperature above this value is treated as an outlier
threshold = 35
is_outlier = df["temp"] > threshold
outliers = df.loc[is_outlier]

# The cell's value is a (label, frame) tuple, so it is displayed as a tuple
"Outliers:\n", outliers

# Option 1: Replace with average
# df.loc[df["temp"] > threshold, "temp"] = df["temp"].mean()
# df

# Option 2: Drop outliers
# df = df[df["temp"] <= threshold]
# df


('Outliers:\n',
       day  temp
 4  Friday    50)

## Removing Duplicates

In [58]:
# Start again from the raw forecast
df = pd.DataFrame(weather_json["forecast"])

# Append a copy of row 1 to simulate a duplicated record, renumbering the index
repeated_row = df.iloc[[1]]
df = pd.concat([df, repeated_row], ignore_index=True)
df
# Drop duplicates
# df = df.drop_duplicates()
# df


Unnamed: 0,day,temp
0,Monday,25
1,Tuesday,30
2,Wednesday,22
3,Thursday,32
4,Friday,50
5,Tuesday,30


## Fixing Data Types

In [63]:

# Simulate a numeric column that arrived as text
df["temp"] = df["temp"].astype(str)
df.info()

# Convert back to numbers. errors="coerce" turns unparseable values into NaN
# instead of raising. This step must run: the temperature comparisons later
# in the notebook fail on strings.
df["temp"] = pd.to_numeric(df["temp"], errors="coerce")
df.info()




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   day     6 non-null      object
 1   temp    6 non-null      object
dtypes: object(2)
memory usage: 228.0+ bytes
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   day     6 non-null      object
 1   temp    6 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 228.0+ bytes


## Standardizing Day Names or Text Columns

In [65]:
df.loc[0, "day"] = "monday"  # Simulate inconsistency

# Normalise the casing: first letter upper-case, the rest lower-case
df["day"] = df["day"].str.capitalize()
df


Unnamed: 0,day,temp
0,Monday,25
1,Tuesday,30
2,Wednesday,22
3,Thursday,32
4,Friday,50
5,Tuesday,30


## Creating a Date Column

In [66]:

# First calendar date of the series. Dates are assigned by row position, one
# day per row; they are not derived from the `day` names.
# NOTE(review): 15 April 2025 falls on a Tuesday, so the dates do not line up
# with the weekday labels. Confirm whether that matters for this example.
start_date = datetime.date(2025, 4, 15)
one_day = datetime.timedelta(days=1)

df["date"] = [start_date + one_day * offset for offset in range(len(df))]

df


Unnamed: 0,day,temp,date
0,Monday,25,2025-04-15
1,Tuesday,30,2025-04-16
2,Wednesday,22,2025-04-17
3,Thursday,32,2025-04-18
4,Friday,50,2025-04-19
5,Tuesday,30,2025-04-20


##  Creating Derived Columns (e.g., Temperature Category)

In [67]:
def categorize_temp(temp: float) -> "str | None":
    """Bucket a temperature reading into a coarse category.

    Args:
        temp: Temperature as a number. May be missing (None / NaN).

    Returns:
        "Cool" for values below 26, "Warm" for 26 up to and including 30,
        "Hot" for anything above 30, and None when the reading is missing.
    """
    # Comparisons against NaN are always False, so without this guard a
    # missing reading would fall through to the final branch and be labelled "Hot".
    if pd.isna(temp):
        return None
    if temp < 26:
        return "Cool"
    if temp <= 30:
        return "Warm"
    return "Hot"

# Label every row by passing its temperature through categorize_temp
df["temp_category"] = df["temp"].map(categorize_temp)

df

Unnamed: 0,day,temp,date,temp_category
0,Monday,25,2025-04-15,Cool
1,Tuesday,30,2025-04-16,Warm
2,Wednesday,22,2025-04-17,Cool
3,Thursday,32,2025-04-18,Hot
4,Friday,50,2025-04-19,Hot
5,Tuesday,30,2025-04-20,Warm


## Export Data

In [70]:
# Write the cleaned frame to disk with to_json's default settings. The import
# cell that follows decodes the `date` column with unit='ms', i.e. it expects
# the dates to be stored as epoch milliseconds.
df.to_json("../data/cleaned_weather_data.json")

## Importing Cleaned Data

In [71]:
# The encoding is passed explicitly so decoding does not depend on the
# platform's default text encoding.
with open("../data/cleaned_weather_data.json", encoding="utf-8") as file:
    cleaned_records = json.load(file)

# The decoded dict and the DataFrame get separate names, so `data` only ever
# refers to the DataFrame in this cell.
data = pd.DataFrame(cleaned_records)

# Dates are decoded as epoch milliseconds and turned back into timestamps
data['date'] = pd.to_datetime(data['date'], unit='ms')
data

Unnamed: 0,day,temp,date,temp_category
0,Monday,25,2025-04-15,Cool
1,Tuesday,30,2025-04-16,Warm
2,Wednesday,22,2025-04-17,Cool
3,Thursday,32,2025-04-18,Hot
4,Friday,50,2025-04-19,Hot
5,Tuesday,30,2025-04-20,Warm
