### First, we need to import the relevant libraries

In [95]:
import pandas as pd
import numpy as np
import statsmodels.api as sm # We'll use statsmodels for the regressions
import statsmodels.formula.api as smf
import datetime as datetime

### Then, we need to import the datasets we want to use

In [96]:
ucdp = pd.read_csv("/Users/niklas/Documents/Uni/5. Semester/Data Science mit Python/UCDP.csv", low_memory=False)

weo = pd.read_excel("/Users/niklas/Documents/Uni/5. Semester/Data Science mit Python/WEO_Data.xlsx")

### We need to make the datasets ready for statistical analysis

In [97]:
ucdp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 349733 entries, 0 to 349732
Data columns (total 49 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   id                 349733 non-null  int64  
 1   relid              349733 non-null  object 
 2   year               349733 non-null  int64  
 3   active_year        349733 non-null  int64  
 4   code_status        349733 non-null  object 
 5   type_of_violence   349733 non-null  int64  
 6   conflict_dset_id   349733 non-null  int64  
 7   conflict_new_id    349733 non-null  int64  
 8   conflict_name      349733 non-null  object 
 9   dyad_dset_id       349733 non-null  int64  
 10  dyad_new_id        349733 non-null  int64  
 11  dyad_name          349733 non-null  object 
 12  side_a_dset_id     349733 non-null  int64  
 13  side_a_new_id      349733 non-null  int64  
 14  side_a             349733 non-null  object 
 15  side_b_dset_id     349733 non-null  int64  
 16  si

In [98]:
ucdp.columns

Index(['id', 'relid', 'year', 'active_year', 'code_status', 'type_of_violence',
       'conflict_dset_id', 'conflict_new_id', 'conflict_name', 'dyad_dset_id',
       'dyad_new_id', 'dyad_name', 'side_a_dset_id', 'side_a_new_id', 'side_a',
       'side_b_dset_id', 'side_b_new_id', 'side_b', 'number_of_sources',
       'source_article', 'source_office', 'source_date', 'source_headline',
       'source_original', 'where_prec', 'where_coordinates',
       'where_description', 'adm_1', 'adm_2', 'latitude', 'longitude',
       'geom_wkt', 'priogrid_gid', 'country', 'country_id', 'region',
       'event_clarity', 'date_prec', 'date_start', 'date_end', 'deaths_a',
       'deaths_b', 'deaths_civilians', 'deaths_unknown', 'best', 'high', 'low',
       'gwnoa', 'gwnob'],
      dtype='object')

In [99]:
ucdp["total_deaths"] = ucdp.deaths_a + ucdp.deaths_b + ucdp.deaths_civilians # make a column for the total deaths

ucdp["conflict_dummy"] = ucdp["total_deaths"].apply(lambda x: 1 if x > 1000 else 0) # all conflicts, that bypass a threshold of 1,000 deaths, are coded to 1 and all others are coded to 0

len(ucdp[ucdp["conflict_dummy"] == 1]) # the total threshold identifies 202 conflicts

202

In [100]:
# convert all dates to datetime-objects
ucdp["date_start"] = pd.to_datetime(ucdp["date_start"])
ucdp["date_end"] = pd.to_datetime(ucdp["date_end"])

# add a new DataFrame with years from 1988-2023
conflict = ucdp[(ucdp["year"] >= 1988) & (ucdp["year"] <= 2023)].copy()

In [101]:
# now for the weo-dataset: all years are organized in columns
gdp = weo.melt(id_vars="Real GDP growth (Annual percent change)") # change the dataframe from the wide-format to the long format to match the ucdp-dataset

gdp.rename(columns={"variable": "year", "value": "gdp", "Real GDP growth (Annual percent change)": "country"}, inplace=True) # renaming columns

gdp = gdp.dropna(subset=["country"]) # drop all rows which have NaN-values in the column country
# now, we need to perform imputation on the gdp-dataframe, which is possible, since we are working with time series data

### Imputation of the gdp-dataframe

In [109]:
# first, we need to check, if there are more NaN-values or if the rest is just "no data"
gdp.country.isna().sum() # we have no more NaN-values and can work with the "no data"-cells (imputation)

def impute_missing_gdp(group): # define a function, which changes all missing data to the average of the before and after value
    group['gdp'] = group['gdp'].replace('no data', np.nan)
    if 

# group data after countries and use the function for applying imputation
gdp.groupby('country').apply(impute_missing_gdp).copy(deep=False)

AttributeError: 'tuple' object has no attribute 'country'

### Next, the regression model (local projection) is specified

In [103]:
X = conflict["conflict_dummy"].exog
Y = weo[]

lp = sm.OLS(Y, X)

res = lp.fit()

print(res.summary())

SyntaxError: invalid syntax (1672828355.py, line 2)