# In [None]:  (Jupyter notebook cell marker left over from export)
import os
from openai import OpenAI
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages

# Initialize the OpenAI client from the OPENAI_API_KEY environment variable so
# the secret never has to be written into this file. When the variable is not
# set, `client` is None; summarize_notes() is the only function in this file
# that uses it, so the rest of the analysis still runs without a key.
_openai_api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=_openai_api_key) if _openai_api_key else None


def load_and_prepare_data(plants_path, acled_path):
    """Load the plants and ACLED CSV files as point GeoDataFrames.

    Both files are read with ``pd.read_csv`` and their column names are
    printed. One point per row is then built from the plants'
    ``Longitude``/``Latitude`` columns and from the ACLED
    ``longitude``/``latitude`` columns, and both layers are tagged EPSG:4326.

    Args:
        plants_path: Path of the plants CSV file.
        acled_path: Path of the ACLED events CSV file.

    Returns:
        A ``(plants_gdf, acled_gdf)`` tuple of GeoDataFrames.
    """
    def as_point_layer(table, lon_column, lat_column):
        # One Point per row, in (longitude, latitude) order.
        points = [
            Point(lon, lat)
            for lon, lat in zip(table[lon_column], table[lat_column])
        ]
        return gpd.GeoDataFrame(table, geometry=points, crs="EPSG:4326")

    plants_table = pd.read_csv(plants_path)
    events_table = pd.read_csv(acled_path)

    # Show the available columns of both inputs.
    print("ACLED columns:")
    print(events_table.columns.tolist())
    print("\nPlants columns:")
    print(plants_table.columns.tolist())

    plants_layer = as_point_layer(plants_table, 'Longitude', 'Latitude')
    events_layer = as_point_layer(events_table, 'longitude', 'latitude')
    return plants_layer, events_layer

#    Find ACLED events within specified distance of plants

def find_nearby_events(plants_gdf, acled_gdf, distance_km=1):
    """Find ACLED events that lie within ``distance_km`` of a plant.

    Both layers are projected to EPSG:3857 (Web Mercator), each plant is
    buffered, and events falling inside a buffer are joined to that plant.
    An event inside the buffers of several plants yields one row per plant.

    Web Mercator stretches lengths by ``1 / cos(latitude)``, so projected
    units are only true metres at the equator. The buffer radius and the
    reported distance are therefore corrected with the cosine of each plant's
    latitude. This local correction is accurate for buffers of a few
    kilometres; it is an approximation for very large ``distance_km``.

    Args:
        plants_gdf: GeoDataFrame of plants with a defined CRS.
        acled_gdf: GeoDataFrame of ACLED events with a defined CRS.
        distance_km: Search radius around each plant, in kilometres.

    Returns:
        GeoDataFrame of the matching events (EPSG:3857 geometry) carrying the
        joined plant columns, ``index_right`` (the plant's index label) and
        ``distance_km`` (ground distance from the event to that plant). The
        frame is empty, but still has a ``distance_km`` column, when no event
        is within range.
    """
    # Project to a coordinate system whose units are (nominally) metres.
    plants_proj = plants_gdf.to_crs(epsg=3857)
    acled_proj = acled_gdf.to_crs(epsg=3857)

    # Ground metres represented by one projected unit at each plant's latitude.
    plant_bounds = plants_gdf.to_crs(epsg=4326).geometry.bounds
    plant_lat_deg = ((plant_bounds['miny'] + plant_bounds['maxy']) / 2).to_numpy(dtype=float)
    ground_per_map_unit = np.cos(np.radians(plant_lat_deg))
    # Missing or (near-)polar coordinates give an unusable factor; fall back
    # to "no correction" for those plants instead of propagating NaN/inf.
    usable = np.isfinite(ground_per_map_unit) & (ground_per_map_unit > 1e-6)
    ground_per_map_unit = np.where(usable, ground_per_map_unit, 1.0)

    # Buffer every plant by the requested ground distance.
    distance_meters = distance_km * 1000
    plants_proj['buffer'] = plants_proj.geometry.buffer(distance_meters / ground_per_map_unit)

    # Spatial join: keep events that fall inside a plant's buffer.
    plants_buffer = plants_proj.set_geometry('buffer')
    nearby_events = gpd.sjoin(acled_proj, plants_buffer, how="inner", predicate="within")

    # Vectorised event-to-plant distance. The operands are paired by position
    # (align=False) because an event matched to several plants repeats its
    # index label. This also works when the join result is empty.
    matched_plants = nearby_events['index_right'].to_numpy()
    plant_points = plants_proj.geometry.loc[matched_plants]
    map_distance = nearby_events.geometry.distance(plant_points, align=False).to_numpy()
    correction = pd.Series(ground_per_map_unit, index=plants_proj.index).loc[matched_plants].to_numpy()
    nearby_events['distance_km'] = map_distance * correction / 1000

    return nearby_events

# Add a text-based match flag using the event's location and notes fields.
# NOTE: this approach is less useful than the buffer matching.
def text_based_matching(nearby_events, plants_gdf):
    """Flag events whose text mentions the matched plant.

    For every row of ``nearby_events`` the plant referenced by
    ``index_right`` is looked up in ``plants_gdf``. The row is flagged when
    the plant's ``City`` or ``Province`` occurs in the event's ``location``
    or ``notes``, or when the plant's ``Company name`` occurs in ``notes``.
    Comparison is a case-insensitive substring test.

    Missing or blank plant attributes are skipped, and missing event text is
    treated as empty. Otherwise a missing value would be compared as the
    literal string ``"nan"`` and match unrelated text such as "Nanjing".

    Args:
        nearby_events: Events with ``index_right``, ``location`` and
            ``notes`` columns. Modified in place.
        plants_gdf: Plants table indexed by the labels stored in
            ``index_right``, with ``City``, ``Province`` and
            ``Company name`` columns.

    Returns:
        The same ``nearby_events`` object with a boolean ``location_match``
        column added.
    """
    def normalize(value):
        # Lower-cased, stripped text; '' for missing values.
        if pd.isna(value):
            return ""
        return str(value).strip().lower()

    # Search terms per plant label, computed once per plant rather than per row.
    terms_by_plant = {}

    def terms_for(plant_label):
        if plant_label not in terms_by_plant:
            plant = plants_gdf.loc[plant_label]
            place_terms = [
                term
                for term in (normalize(plant['City']), normalize(plant['Province']))
                if term
            ]
            company = normalize(plant['Company name'])
            # The company name is only searched for in the notes field.
            note_terms = place_terms + ([company] if company else [])
            terms_by_plant[plant_label] = (place_terms, note_terms)
        return terms_by_plant[plant_label]

    matches = []
    for plant_label, location, notes in zip(
        nearby_events['index_right'], nearby_events['location'], nearby_events['notes']
    ):
        place_terms, note_terms = terms_for(plant_label)
        location_text = normalize(location)
        notes_text = normalize(notes)
        matches.append(
            any(term in location_text for term in place_terms)
            or any(term in notes_text for term in note_terms)
        )

    # A positional array keeps this valid for empty input and for repeated
    # index labels.
    nearby_events['location_match'] = np.array(matches, dtype=bool)

    return nearby_events

#    Summarize the notes of events within a specified distance using OpenAI API. Identify any events related to fisheries, fishing, and fishing activities.
def summarize_notes(events_within_distance):
    """Summarize the ``notes`` of the given events with the OpenAI API.

    The non-missing notes are joined into one text and split on whitespace
    into chunks of at most ~100,000 characters. Each chunk is summarized
    with a prompt that asks for fisheries-related events to be identified,
    and the chunk summaries are then summarized once more.

    Args:
        events_within_distance: Table with a ``notes`` column.

    Returns:
        The final summary text, or ``""`` when there are no notes to
        summarize (no API request is made in that case).
    """
    # Extract notes; cast to str so a stray non-text value cannot break join().
    notes = [str(note) for note in events_within_distance['notes'].dropna()]

    # Concatenate notes into a single string
    notes_text = " ".join(notes)

    def split_into_chunks(text, max_length=100000):
        # Greedy word packing; each word costs len(word) + 1 for its separator.
        chunks = []
        current_chunk = []
        current_length = 0

        for word in text.split():
            # Only close a chunk that has content, so an over-long first word
            # does not produce an empty chunk.
            if current_chunk and current_length + len(word) + 1 > max_length:
                chunks.append(" ".join(current_chunk))
                current_chunk = [word]
                current_length = len(word) + 1
            else:
                current_chunk.append(word)
                current_length += len(word) + 1

        if current_chunk:
            chunks.append(" ".join(current_chunk))

        return chunks

    chunks = split_into_chunks(notes_text)
    if not chunks:
        # Nothing to summarize: avoid asking the model to summarize nothing.
        return ""

    def ask(prompt):
        # Single-turn request; returns the reply text ('' if the reply has none).
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=3500
        )
        return (response.choices[0].message.content or "").strip()

    # Summarize each chunk
    summaries = [
        ask(f"Summarize the following notes. Identify any events related to fisheries, fishing, and fishing activities: {chunk}")
        for chunk in chunks
    ]

    # Summarize the collected summaries
    return ask("Summarize the following summaries: " + " ".join(summaries))

#  Main function to analyze political violence near industrial plants

def analyze_violence_near_plants(plants_path, acled_path, distance_km=1):
    """Run the plant/event proximity analysis and build two summaries.

    Loads both CSV files, finds the ACLED events within ``distance_km`` of a
    plant, adds the text-based ``location_match`` flag, and aggregates the
    matches per company and per event.

    Args:
        plants_path: Path of the plants CSV file.
        acled_path: Path of the ACLED events CSV file.
        distance_km: Search radius around each plant, in kilometres.

    Returns:
        A ``(results, company_summary, event_summary)`` tuple:
        ``results`` is the event/plant match table; ``company_summary`` is
        grouped by ``Company name`` (event count, mean and minimum distance,
        number of text matches, total fatalities, most frequent event type);
        ``event_summary`` is grouped by ``event_id_cnty`` (number of matched
        plants, minimum distance, whether any text match occurred, and the
        first ``fatalities`` value).
    """
    # Load and prepare data
    plants_gdf, acled_gdf = load_and_prepare_data(plants_path, acled_path)

    # Find nearby events
    nearby_events = find_nearby_events(plants_gdf, acled_gdf, distance_km)

    # Add text-based matching
    results = text_based_matching(nearby_events, plants_gdf)

    # NOTE: the OpenAI notes summary is currently disabled. To enable it for
    # events within 2.5 miles (approx. 4.02 km), call e.g.
    # summarize_notes(results[results['distance_km'] <= 4.02]).

    # 1. Company-based summary
    company_summary = results.groupby('Company name').agg({
        'event_id_cnty': 'count',
        'distance_km': ['mean', 'min'],
        'location_match': 'sum',
        'fatalities': 'sum',
        # Most frequent event type; None when every value in the group is
        # missing (value_counts() would be empty and index[0] would raise).
        'event_type': lambda x: x.value_counts().index[0] if x.notna().any() else None
    }).round(2)

    # 2. Event-based summary
    event_summary = results.groupby('event_id_cnty').agg({
        'Company name': 'count',
        'distance_km': 'min',
        'location_match': 'any',
        'fatalities': 'first'
    }).round(2)

    return results, company_summary, event_summary

#     Create visualizations for the analysis

def visualize_data(results, plants_gdf, acled_gdf):
    """Display four figures describing the plant/event matches.

    Shown in order: a map of plants (blue) and ACLED events (red), a
    histogram of ``distance_km``, a bar chart of ``event_type`` counts, and
    a scatter plot of ``fatalities`` against ``distance_km``. Each figure is
    shown with ``plt.show()``.

    Args:
        results: Match table with ``distance_km``, ``event_type`` and
            ``fatalities`` columns.
        plants_gdf: GeoDataFrame of plants.
        acled_gdf: GeoDataFrame of ACLED events.
    """
    # 1. Map of events and plants, drawn on the same axes.
    _, map_ax = plt.subplots(1, 1, figsize=(10, 10))
    plants_gdf.plot(ax=map_ax, color='blue', markersize=5, label='Plants')
    acled_gdf.plot(ax=map_ax, color='red', markersize=5, label='ACLED Events')
    map_ax.set_title('Map of Plants and ACLED Events')
    map_ax.legend()
    plt.show()

    # 2. Histogram of distances.
    _, hist_ax = plt.subplots(figsize=(8, 6))
    hist_ax.hist(results['distance_km'], bins=30, color='skyblue', edgecolor='black')
    hist_ax.set_title('Histogram of Distances from Plants to Events')
    hist_ax.set_xlabel('Distance (km)')
    hist_ax.set_ylabel('Frequency')
    plt.show()

    # 3. Bar chart of event types.
    _, bar_ax = plt.subplots(figsize=(10, 6))
    event_type_counts = results['event_type'].value_counts()
    event_type_counts.plot(kind='bar', color='lightgreen', ax=bar_ax)
    bar_ax.set_title('Frequency of Event Types Near Plants')
    bar_ax.set_xlabel('Event Type')
    bar_ax.set_ylabel('Count')
    plt.xticks(rotation=45)
    plt.show()

    # 4. Scatter plot of fatalities vs. distance.
    _, scatter_ax = plt.subplots(figsize=(8, 6))
    scatter_ax.scatter(results['distance_km'], results['fatalities'], alpha=0.5, color='purple')
    scatter_ax.set_title('Scatter Plot of Fatalities vs. Distance')
    scatter_ax.set_xlabel('Distance (km)')
    scatter_ax.set_ylabel('Fatalities')
    plt.show()

import matplotlib.dates as mdates

def plot_stacked_bar_by_country(results, buffer_km=1):
    """Plot a stacked bar chart of events by country, grouped by quarter.

    Only events with ``distance_km <= buffer_km`` are counted. The input
    table is not modified: dates are parsed on a filtered copy.

    Args:
        results: Match table with ``event_date``, ``distance_km`` and
            ``country`` columns.
        buffer_km: Maximum distance, in kilometres, for an event to be
            included. Defaults to 1.

    Returns:
        None. The chart is shown with ``plt.show()``. When no event falls
        within the buffer, a message is printed and nothing is drawn.
    """
    # Work on a filtered copy so the caller's frame is left untouched.
    within_buffer = results.loc[results['distance_km'] <= buffer_km, ['country', 'event_date']]
    if within_buffer.empty:
        print(f"No events within {buffer_km} km to plot.")
        return

    # Group by country and calendar quarter.
    within_buffer = within_buffer.assign(
        quarter=pd.to_datetime(within_buffer['event_date']).dt.to_period('Q')
    )
    grouped = within_buffer.groupby(['country', 'quarter']).size().unstack(fill_value=0)
    if grouped.empty:
        # Every remaining row lacked a country or a date.
        print(f"No events within {buffer_km} km to plot.")
        return

    # Plot: one bar per quarter, stacked by country.
    ax = grouped.T.plot(kind='bar', stacked=True, figsize=(12, 8), colormap='tab20')
    ax.set_title('Events by Country Over Time (Grouped by Quarter)')
    ax.set_xlabel('Quarter')
    ax.set_ylabel('Number of Events')
    ax.set_xticklabels([str(q) for q in grouped.columns], rotation=45)
    plt.legend(title='Country', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()

#     Save analysis results and visualizations to a PDF

def save_to_pdf(results, company_summary, event_summary, plants_gdf, acled_gdf,
                output_path='analysis_report.pdf'):
    """Write the analysis figures and summary tables to a multi-page PDF.

    Pages, in order: map of plants and events, histogram of distances, bar
    chart of event types, scatter of fatalities vs. distance, the company
    summary table and the event summary table.

    Args:
        results: Match table with ``distance_km``, ``event_type`` and
            ``fatalities`` columns.
        company_summary: Per-company summary table.
        event_summary: Per-event summary table.
        plants_gdf: GeoDataFrame of plants.
        acled_gdf: GeoDataFrame of ACLED events.
        output_path: Destination PDF file. Defaults to
            ``'analysis_report.pdf'``.
    """
    def add_page(pdf, fig):
        # Always release the figure, even if saving the page fails.
        try:
            pdf.savefig(fig)
        finally:
            plt.close(fig)

    def column_label(column):
        # Multi-level column keys arrive as tuples; join them for display.
        if isinstance(column, tuple):
            return " / ".join(str(part) for part in column)
        return str(column)

    def add_table_page(pdf, summary, title):
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.axis('off')
        ax.axis('tight')
        if summary.empty:
            # ax.table() cannot be built from an empty cell grid.
            ax.text(0.5, 0.5, 'No data available', ha='center', va='center',
                    transform=ax.transAxes)
        else:
            ax.table(
                cellText=summary.values,
                # The index holds the company name / event id of each row.
                rowLabels=[str(label) for label in summary.index],
                colLabels=[column_label(column) for column in summary.columns],
                loc='center',
            )
        ax.set_title(title)
        add_page(pdf, fig)

    with PdfPages(output_path) as pdf:
        # 1. Map of Events and Plants
        fig, ax = plt.subplots(1, 1, figsize=(10, 10))
        plants_gdf.plot(ax=ax, color='blue', markersize=5, label='Plants')
        acled_gdf.plot(ax=ax, color='red', markersize=5, label='ACLED Events')
        ax.set_title('Map of Plants and ACLED Events')
        ax.legend()
        add_page(pdf, fig)

        # 2. Histogram of Distances
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.hist(results['distance_km'], bins=30, color='skyblue', edgecolor='black')
        ax.set_title('Histogram of Distances from Plants to Events')
        ax.set_xlabel('Distance (km)')
        ax.set_ylabel('Frequency')
        add_page(pdf, fig)

        # 3. Bar Chart of Event Types
        fig, ax = plt.subplots(figsize=(10, 6))
        event_type_counts = results['event_type'].value_counts()
        if not event_type_counts.empty:
            # pandas cannot draw a bar chart from an empty series.
            event_type_counts.plot(kind='bar', color='lightgreen', ax=ax)
            ax.tick_params(axis='x', labelrotation=45)
        ax.set_title('Frequency of Event Types Near Plants')
        ax.set_xlabel('Event Type')
        ax.set_ylabel('Count')
        add_page(pdf, fig)

        # 4. Scatter Plot of Fatalities vs. Distance
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.scatter(results['distance_km'], results['fatalities'], alpha=0.5, color='purple')
        ax.set_title('Scatter Plot of Fatalities vs. Distance')
        ax.set_xlabel('Distance (km)')
        ax.set_ylabel('Fatalities')
        add_page(pdf, fig)

        # 5. Data summaries
        add_table_page(pdf, company_summary, 'Company Summary')
        add_table_page(pdf, event_summary, 'Event Summary')

if __name__ == "__main__":
    # Input files: the plants master list and the ACLED event export.
    plants_path = "OO Master FMFO Database - Master List of FM_FO Plants Globally.csv"
    acled_path = "2020-02-01-2025-02-20.csv"

    try:
        results, company_summary, event_summary = analyze_violence_near_plants(
            plants_path, acled_path
        )

        # Write the three result tables next to the script.
        for table, csv_name in (
            (results, "detailed_results.csv"),
            (company_summary, "company_summary_results.csv"),
            (event_summary, "event_summary_results.csv"),
        ):
            table.to_csv(csv_name)

        print("Analysis complete! Results saved to detailed_results.csv, company_summary_results.csv, and event_summary_results.csv")

        # Headline statistics for a quick sanity check.
        event_count = len(results)
        print(f"\nFound {event_count} events near plants")
        mean_distance = results['distance_km'].mean()
        print(f"Average distance to nearest plant: {mean_distance:.2f} km")
        total_fatalities = results['fatalities'].sum()
        print(f"Total fatalities in nearby events: {total_fatalities}")
        print("\nMost common event types:")
        top_event_types = results['event_type'].value_counts().head()
        print(top_event_types)

        # The analysis does not return the input layers, so load them again
        # for the overview figures.
        plants_gdf, acled_gdf = load_and_prepare_data(plants_path, acled_path)
        visualize_data(results, plants_gdf, acled_gdf)

        # Quarterly stacked bar chart by country.
        plot_stacked_bar_by_country(results)

    except Exception as exc:
        # Top-level boundary: report the failure instead of a raw traceback.
        print(f"Error during analysis: {exc}")