In [1]:
import os
import re
from collections import Counter
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# ---------------------------
# Configuration
# ---------------------------
# Input CSV with the incident records -- change this to the path of your file.
DATA_PATH = 'osha_fatalities.csv'
# Folder that receives the generated figures and text summaries.
PLOTS_DIR = 'plots'

# Make sure the output folder exists; exist_ok keeps re-runs of this cell harmless.
os.makedirs(PLOTS_DIR, exist_ok=True)

In [4]:
# ---------------------------
# Load data
# ---------------------------
# `infer_datetime_format` is deprecated in pandas 2.x (consistent format
# inference is now the default behaviour), so it is no longer passed.
# dayfirst=False keeps month-first parsing of ambiguous dates.
df = pd.read_csv(DATA_PATH, parse_dates=['incident_date'], dayfirst=False)

# Normalize text columns to lowercase for robust matching
text_cols = ['description', 'plan', 'citation', 'city', 'state', 'day_of_week']
for c in text_cols:
    if c in df.columns:
        # astype(str) would turn missing cells into the literal text "nan",
        # which then gets counted as a real city/state/word and defeats the
        # `na=False` / `.dropna()` guards used by later cells. Remember where
        # the gaps are and restore them after the string clean-up.
        missing = df[c].isna()
        df[c] = df[c].astype(str).str.strip().str.lower().mask(missing)
    else:
        # create missing columns to avoid KeyError later
        df[c] = ""


In [5]:
# Coerce incident_date to datetime (values that cannot be parsed become NaT),
# keep only the rows with a usable date, then derive the calendar columns
# that the later sections group by.
df['incident_date'] = pd.to_datetime(df['incident_date'], errors='coerce')
df = df.loc[df['incident_date'].notna()].copy()
when = df['incident_date'].dt
df = df.assign(
    year=when.year,
    month=when.month,
    month_name=when.strftime('%b'),            # abbreviated month name
    quarter=when.to_period('Q').astype(str),   # quarterly period as text
    ym=when.to_period('M').astype(str),        # monthly period as text
)

In [6]:
# Helper: save figure
def save_fig(fig, fname, dpi=150):
    """Save `fig` into PLOTS_DIR under `fname`, close it, and return the path.

    Parameters
    ----------
    fig : figure object exposing `savefig` (the cells pass matplotlib figures)
    fname : file name (not a full path) to create inside PLOTS_DIR
    dpi : resolution forwarded to `savefig`

    Returns
    -------
    The path of the written file, i.e. PLOTS_DIR joined with `fname`.
    """
    target = os.path.join(PLOTS_DIR, fname)
    fig.savefig(target, bbox_inches='tight', dpi=dpi)
    # Close the figure once it is on disk so it is released from pyplot.
    plt.close(fig)
    return target

In [7]:
# ---------------------------
# 1 Yearly trend of fatal incidents (line)
# ---------------------------
# One row per year: the number of incident records dated in that year.
yearly = df.groupby('year').size().reset_index(name='total')

fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(data=yearly, x='year', y='total', marker='o', ax=ax)
ax.set_title('Yearly Trend of Fatal Incidents')
ax.set_xlabel('Year')
ax.set_ylabel('Total Incidents')
save_fig(fig, '01_yearly_trend.png')

'plots\\01_yearly_trend.png'

In [8]:
# ---------------------------
# 2 Day of week fatalities (bar)
# ---------------------------
# Fixed Monday-to-Sunday order; a day that never occurs is shown as 0.
dow_order = ['monday','tuesday','wednesday','thursday','friday','saturday','sunday']
dow_counts = df['day_of_week'].value_counts().reindex(dow_order).fillna(0)

fig, ax = plt.subplots(figsize=(9, 5))
sns.barplot(x=dow_counts.index, y=dow_counts.values, ax=ax)
ax.set_title('Fatalities by Day of Week')
ax.set_xlabel('Day of Week')
ax.set_ylabel('Count')
# Slight tilt keeps the day names from colliding.
ax.tick_params(axis='x', labelrotation=20)
save_fig(fig, '02_day_of_week.png')

'plots\\02_day_of_week.png'

In [9]:
# ---------------------------
# 3 Top 15 states with the highest fatalities (bar)
# ---------------------------
# value_counts is already sorted by frequency, so head(15) is the top 15.
top_states = df['state'].value_counts().head(15)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(y=top_states.index, x=top_states.values, ax=ax)
ax.set(title='Top 15 States by Fatalities', xlabel='Total Incidents', ylabel='State')
save_fig(fig, '03_top_states.png')

'plots\\03_top_states.png'

In [10]:
# ---------------------------
# 4 Top 15 cities with the highest fatalities (bar)
# ---------------------------
# The 15 most frequent values of the `city` column, most frequent first.
top_cities = df['city'].value_counts().head(15)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(y=top_cities.index, x=top_cities.values, ax=ax)
ax.set(title='Top 15 Cities by Fatalities', xlabel='Total Incidents', ylabel='City')
save_fig(fig, '04_top_cities.png')

'plots\\04_top_cities.png'

In [12]:
# ---------------------------
# 5 Count of "fall" keyword in descriptions
# ---------------------------
# Whole-word match on "fall" only; other word forms such as "fell" or
# "falls" are not matched by this pattern.
mentions_fall = df['description'].str.contains(r'\bfall\b', na=False)
fall_count = mentions_fall.sum()

fall_report = os.path.join(PLOTS_DIR, '05_fall_count.txt')
with open(fall_report, 'w') as f:
    f.write(f'Number of incidents mentioning "fall": {fall_count}\n')

In [13]:
# ---------------------------
# 6 Distribution by month (bar)
# ---------------------------
# Months 1..12; a month without any incident is kept with a count of 0.
month_order = list(range(1,13))
per_month = df.groupby('month').size().reindex(month_order, fill_value=0)
monthly = per_month.rename_axis('month').reset_index(name='count')

fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x='month', y='count', data=monthly, ax=ax)
ax.set(title='Fatalities by Month', xlabel='Month', ylabel='Total Incidents')
save_fig(fig, '06_monthly_distribution.png')

'plots\\06_monthly_distribution.png'

In [14]:
# ---------------------------
# 7 States with highest % of unknown plans (bar)
# ---------------------------
# Per state: all incidents, and the incidents whose plan is 'unknown'.
state_totals = df.groupby('state').size().rename('total')
unknown_plan_rows = df.loc[df['plan'].eq('unknown')]
state_unknown = unknown_plan_rows.groupby('state').size().rename('unknown')

# States without any unknown plan have no entry in state_unknown -> fill 0.
state_summary = pd.concat([state_totals, state_unknown], axis=1).fillna(0)
state_summary['pct_unknown'] = 100.0 * state_summary['unknown'] / state_summary['total']

# Only states with at least 10 incidents are ranked; keep the 20 highest.
enough_incidents = state_summary['total'] >= 10
state_summary = (
    state_summary[enough_incidents]
    .sort_values('pct_unknown', ascending=False)
    .head(20)
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(y=state_summary.index, x=state_summary['pct_unknown'], ax=ax)
ax.set_title('Top States by % Unknown Safety Plans (states with >=10 incidents)')
ax.set_xlabel('% Unknown Plans')
ax.set_ylabel('State')
save_fig(fig, '07_unknown_plans_pct.png')

'plots\\07_unknown_plans_pct.png'

In [15]:
# ---------------------------
# 8 Year-over-year growth rate (line)
# ---------------------------
# Reads the `yearly` frame (columns 'year' and 'total').
year_totals = yearly.set_index('year')['total']
# Percentage change against the previous row. The first year has no
# predecessor, so its NaN is replaced by 0 and it is plotted as 0 %.
yoy = year_totals.pct_change().fillna(0) * 100

fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(x=yoy.index, y=yoy.values, marker='o', ax=ax)
ax.set_title('Year-over-Year Growth Rate in Fatalities (%)')
ax.set_xlabel('Year')
ax.set_ylabel('Growth Rate (%)')
save_fig(fig, '08_yoy_growth.png')

'plots\\08_yoy_growth.png'

In [16]:
# ---------------------------
# 9 Ladder-related incidents by state (top 15)
# ---------------------------
# Any description containing the substring "ladder" counts; the filter does
# not additionally require a fall to be mentioned.
mentions_ladder = df['description'].str.contains(r'ladder', na=False)
ladder = df[mentions_ladder]
ladder_by_state = ladder['state'].value_counts().head(15)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(y=ladder_by_state.index, x=ladder_by_state.values, ax=ax)
ax.set(title='Ladder-related Fatalities by State (Top 15)', xlabel='Count', ylabel='State')
save_fig(fig, '09_ladder_by_state.png')

'plots\\09_ladder_by_state.png'

In [17]:
# ---------------------------
# 10 Most common causes (simple keyword buckets)
# ---------------------------
# Bucket name -> regular expression searched for in the description.
# A description can match several buckets, so the buckets are not exclusive.
# NOTE(review): the patterns list specific word forms only (e.g. "fall" but
# not "fell") -- confirm this is the intended coverage.
keywords = {
    'fall': r'\bfall\b',
    'struck': r'\bstruck\b|\bstruck by\b|\bstruck against\b',
    'collapse': r'\bcollapse\b|\bcollapsed\b',
    'heat': r'\bheat\b|\bheat stroke\b|\bheat-related\b',
    'electrocution': r'\belectrocute\b|\belectrocution\b|electrocuted'
}

# Number of descriptions matching each bucket's pattern.
counts = {}
for bucket, pattern in keywords.items():
    counts[bucket] = df['description'].str.contains(pattern, na=False).sum()
kc = pd.Series(counts).sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=kc.values, y=kc.index, ax=ax)
ax.set(title='Counts by Cause Keywords', xlabel='Count', ylabel='Cause')
save_fig(fig, '10_cause_keywords.png')

'plots\\10_cause_keywords.png'

In [18]:
# ---------------------------
# 11 Missing citations percentage (pie)
# ---------------------------
# "Unknown" means the citation text is exactly 'unknown'; every other row
# is counted as known.
citation_unknown = df['citation'].eq('unknown').sum()
citation_known = len(df) - citation_unknown

fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    [citation_known, citation_unknown],
    labels=['Known','Unknown'],
    autopct='%1.1f%%',
    startangle=140,
)
ax.set_title('Known vs Unknown Citations')
save_fig(fig, '11_citation_unknown_pie.png')

'plots\\11_citation_unknown_pie.png'

In [19]:
# ---------------------------
# 12 Quarterly incidents heatmap (year x quarter)
# ---------------------------
# Work on a copy that carries the quarter number (1-4) of each incident.
q = df.assign(qnum=df['incident_date'].dt.quarter)
# Rows: year, columns: quarter; year/quarter pairs without incidents are 0.
qt = q.groupby(['year','qnum']).size().unstack(fill_value=0)

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(qt, annot=True, fmt='d', cmap='YlOrRd', ax=ax)
ax.set(title='Incidents: Year x Quarter', xlabel='Quarter', ylabel='Year')
save_fig(fig, '12_quarter_heatmap.png')

'plots\\12_quarter_heatmap.png'

In [20]:
# ---------------------------
# 13 Top 10 states with heat incidents
# ---------------------------
# Descriptions containing "heat" as a whole word.
mentions_heat = df['description'].str.contains(r'\bheat\b', na=False)
heat = df[mentions_heat]
heat_top_states = heat['state'].value_counts().head(10)

fig, ax = plt.subplots(figsize=(9, 5))
sns.barplot(x=heat_top_states.values, y=heat_top_states.index, ax=ax)
ax.set(title='Top States by Heat-related Incidents', xlabel='Count', ylabel='State')
save_fig(fig, '13_heat_states.png')

'plots\\13_heat_states.png'

In [21]:
# ---------------------------
# 14 Avg incidents per year by state (top 15)
# ---------------------------
# Incident count for every (state, year) pair that occurs in the data.
state_year = df.groupby(['state','year']).size().reset_index(name='count')
# Mean of those yearly counts per state. Years in which a state has no
# incident produce no row above, so they do not enter the mean.
mean_by_state = state_year.groupby('state')['count'].mean()
avg_per_year = mean_by_state.sort_values(ascending=False).head(15)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=avg_per_year.values, y=avg_per_year.index, ax=ax)
ax.set_title('Average Incidents per Year by State (Top 15)')
ax.set_xlabel('Avg Incidents/Year')
ax.set_ylabel('State')
save_fig(fig, '14_avg_per_year_by_state.png')

'plots\\14_avg_per_year_by_state.png'

In [22]:
# ---------------------------
# 15 Cities with > 50 incidents (bar)
# ---------------------------
city_counts = df['city'].value_counts()
hot_cities = city_counts[city_counts > 50].sort_values(ascending=False)

# The figure is created inside the branch that uses it. Creating one up
# front and then replacing it in the placeholder branch would leave the
# first figure open (never passed to save_fig, hence never closed).
if len(hot_cities) > 0:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=hot_cities.values, y=hot_cities.index, ax=ax)
    ax.set_title('Cities with > 50 Incidents')
    ax.set_xlabel('Count')
    ax.set_ylabel('City')
    save_fig(fig, '15_cities_over_50.png')
else:
    # No city qualifies: write a small placeholder image carrying a message
    # so that an output file exists for this section either way.
    fig, ax = plt.subplots(figsize=(8, 3))
    ax.text(0.5, 0.5, "No cities with > 50 incidents in dataset", ha='center', va='center')
    ax.axis('off')
    save_fig(fig, '15_cities_over_50_placeholder.png')

In [23]:
# ---------------------------
# 16 Earliest and latest incident (write to text)
# ---------------------------
# First and last incident date, formatted as YYYY-MM-DD.
incident_dates = df['incident_date']
earliest = incident_dates.min().strftime('%Y-%m-%d')
latest = incident_dates.max().strftime('%Y-%m-%d')

range_report = os.path.join(PLOTS_DIR, '16_date_range.txt')
with open(range_report, 'w') as f:
    f.write(f'Earliest incident: {earliest}\nLatest incident: {latest}\n')

In [24]:
# ---------------------------
# 17 Top 20 most common words in descriptions (simple tokenizer)
# ---------------------------
# Words ignored by simple_tokenize. Built once here instead of on every call,
# because the tokenizer runs once per description row.
_STOPWORDS = frozenset([
    'the', 'a', 'an', 'and', 'of', 'on', 'in', 'at', 'to', 'was', 'is',
    'by', 'for', 'with', 'from', 'worker', 'victim',
])


def simple_tokenize(text):
    """Split `text` into lowercase word tokens, dropping noise words.

    Punctuation (anything that is neither a word character nor whitespace)
    is replaced by a space, the text is lowercased and split on whitespace.
    Tokens are then discarded when they are in the stopword list or are
    shorter than three characters.

    Parameters
    ----------
    text : str
        The text to tokenize.

    Returns
    -------
    list of str
        The remaining tokens in their original order (duplicates kept).
    """
    text = re.sub(r'[^\w\s]', ' ', text)  # remove punctuation
    tokens = text.lower().split()
    return [t for t in tokens if t not in _STOPWORDS and len(t) > 2]

# Tokenize every non-missing description and pool the tokens in one list.
all_tokens = [
    token
    for desc in df['description'].dropna().astype(str)
    for token in simple_tokenize(desc)
]

# The 20 most frequent tokens, as a frame indexed by word.
top_words = Counter(all_tokens).most_common(20)
tw_df = pd.DataFrame(top_words, columns=['word','count']).set_index('word')

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=tw_df['count'], y=tw_df.index, ax=ax)
ax.set(title='Top Words in Descriptions', xlabel='Frequency', ylabel='Word')
save_fig(fig, '17_top_words.png')

'plots\\17_top_words.png'

In [25]:
# ---------------------------
# 18 States appearing in top5 each year (count years in top5)
# ---------------------------
# Incident count per (year, state), ranked within each year.
ranked = df.groupby(['year','state']).size().reset_index(name='total')
# Dense ranking: states tied on the same count share a rank and the next
# distinct count gets the next rank, so "rank <= 5" selects the five highest
# distinct counts of a year -- which can be more than five states.
ranked['rank'] = ranked.groupby('year')['total'].rank(method='dense', ascending=False)
top5_yearly = ranked.loc[ranked['rank'] <= 5]
# Number of years in which each state made that cut (20 most frequent).
top_states_persistent = top5_yearly['state'].value_counts().head(20)

fig, ax = plt.subplots(figsize=(9, 6))
sns.barplot(x=top_states_persistent.values, y=top_states_persistent.index, ax=ax)
ax.set_title('States Appearing in Top 5 by Year (Count of Years)')
ax.set_xlabel('Years in Top 5')
ax.set_ylabel('State')
save_fig(fig, '18_states_in_top5_years.png')

'plots\\18_states_in_top5_years.png'

In [26]:
# ---------------------------
# 19 Day-of-week counts (same as 2) - create another visualization sorted
# ---------------------------
# Same counts as section 2, but ordered by frequency instead of by weekday
# and drawn as horizontal bars.
fig, ax = plt.subplots(figsize=(9, 5))
dow_sorted = df['day_of_week'].value_counts().sort_values(ascending=False)
sns.barplot(x=dow_sorted.values, y=dow_sorted.index, ax=ax)
ax.set(title='Day of Week - Sorted by Count', xlabel='Count', ylabel='Day')
save_fig(fig, '19_day_of_week_sorted.png')

'plots\\19_day_of_week_sorted.png'

In [27]:
# ---------------------------
# 20 Incidents mentioning both "roof" and "fall"
# ---------------------------
# "roof" is matched as a substring; "fall" must appear as a whole word.
has_roof = df['description'].str.contains(r'roof', na=False)
has_fall = df['description'].str.contains(r'\bfall\b', na=False)
roof_fall = df[has_roof & has_fall]

roof_fall_report = os.path.join(PLOTS_DIR, '20_roof_and_fall_count.txt')
with open(roof_fall_report, 'w') as f:
    f.write(f'Incidents mentioning both "roof" and "fall": {len(roof_fall)}\n')

In [28]:
# ---------------------------
# 21 Weekday vs Weekend comparison (bar)
# ---------------------------
# Saturday and Sunday are 'Weekend'; every other value of day_of_week
# (including anything that is not a recognised day name) is 'Weekday'.
is_weekend = df['day_of_week'].isin(['saturday','sunday'])
df['day_type'] = np.where(is_weekend, 'Weekend', 'Weekday')
dt_counts = df['day_type'].value_counts()

fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(x=dt_counts.index, y=dt_counts.values, ax=ax)
ax.set(title='Weekday vs Weekend Incidents', xlabel='Day Type', ylabel='Count')
save_fig(fig, '21_weekday_vs_weekend.png')

'plots\\21_weekday_vs_weekend.png'

In [29]:
# ---------------------------
# 22 Rolling 12-month fatalities trend (line)
# ---------------------------
# Count incidents per calendar month over the whole span of the data,
# INCLUDING months with zero incidents. Grouping alone yields rows only for
# months that have at least one incident, and a 12-row rolling window over
# such a table would silently cover more than 12 calendar months.
incident_months = df['incident_date'].dt.to_period('M').dropna()
if incident_months.empty:
    # Nothing to span; keep the frame empty rather than failing on min()/max().
    month_span = pd.PeriodIndex([], freq='M')
else:
    month_span = pd.period_range(incident_months.min(), incident_months.max(), freq='M')
per_month = incident_months.value_counts().reindex(month_span, fill_value=0)

# Same columns as before: month label, count, month start date, rolling sum.
monthly_counts = pd.DataFrame({
    'ym': month_span.astype(str),
    'count': per_month.to_numpy(),
    'ym_dt': month_span.to_timestamp(),
})
# Rows are now consecutive calendar months in chronological order, so a
# 12-row window is exactly 12 months. min_periods=1 keeps the partial sums
# at the start of the series.
monthly_counts['rolling_12m'] = monthly_counts['count'].rolling(window=12, min_periods=1).sum()

fig, ax = plt.subplots(figsize=(12, 5))
sns.lineplot(x='ym_dt', y='rolling_12m', data=monthly_counts, marker='o', ax=ax)
ax.set_title('Rolling 12-month Fatalities')
ax.set_xlabel('Date')
ax.set_ylabel('Rolling 12M Total')
ax.tick_params(axis='x', labelrotation=30)
save_fig(fig, '22_rolling_12m.png')

'plots\\22_rolling_12m.png'

In [30]:
# ---------------------------
# 23 States with incidents but high unknown citations %
# ---------------------------
# Per state: all incidents, and the incidents whose citation is 'unknown'.
state_tot = df.groupby('state').size().rename('total')
unknown_cit_rows = df.loc[df['citation'].eq('unknown')]
state_unknown_cit = unknown_cit_rows.groupby('state').size().rename('unknown_cit')

# States without any unknown citation have no entry above -> fill with 0.
state_cit_summary = pd.concat([state_tot, state_unknown_cit], axis=1).fillna(0)
state_cit_summary['pct_unknown_cit'] = (
    100.0 * state_cit_summary['unknown_cit'] / state_cit_summary['total']
)

# Only states with at least 10 incidents are ranked; keep the 20 highest.
enough_incidents = state_cit_summary['total'] >= 10
state_cit_summary = (
    state_cit_summary[enough_incidents]
    .sort_values('pct_unknown_cit', ascending=False)
    .head(20)
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(y=state_cit_summary.index, x=state_cit_summary['pct_unknown_cit'], ax=ax)
ax.set_title('States with High % Unknown Citations (states with >=10 incidents)')
ax.set_xlabel('% Unknown Citations')
ax.set_ylabel('State')
save_fig(fig, '23_states_unknown_citations.png')

'plots\\23_states_unknown_citations.png'

In [34]:
# ---------------------------
# 24 Top 5 years with most fatalities
# ---------------------------
# Incident count per calendar year. The result gets its own name: the
# yearly-trend and year-over-year sections keep a frame called `yearly`
# with columns 'year'/'total', and rebinding that name to a frame with
# different columns would make those cells fail with a KeyError when they
# are re-run after this one.
yearly_totals = (
    df.groupby(df['incident_date'].dt.year)
    .size()
    .rename_axis('Year')
    .reset_index(name='Total_Incidents')
)

# The five years with the highest counts.
top5_years = yearly_totals.sort_values('Total_Incidents', ascending=False).head(5)

fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(x='Year', y='Total_Incidents', data=top5_years, ax=ax)
ax.set_title('Top 5 Years by Fatalities')
ax.set_xlabel('Year')
ax.set_ylabel('Total Incidents')
# save_fig closes the figure after writing it, so a plt.show() afterwards
# would have nothing left to display; the call is therefore omitted.
save_fig(fig, '24_top5_years.png')

In [32]:
# ---------------------------
# 25 Scaffold incidents by year (line)
# ---------------------------
# Descriptions containing the substring "scaffold" (this also matches longer
# words that contain it, e.g. "scaffolding").
mentions_scaffold = df['description'].str.contains(r'scaffold', na=False)
scaffold = df[mentions_scaffold]
# Years without any matching incident have no row and are not plotted as 0.
scaffold_by_year = scaffold.groupby('year').size().reset_index(name='count')

fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(x='year', y='count', data=scaffold_by_year, marker='o', ax=ax)
ax.set(title='Scaffold-related Incidents by Year', xlabel='Year', ylabel='Count')
save_fig(fig, '25_scaffold_by_year.png')

'plots\\25_scaffold_by_year.png'