<a href="https://colab.research.google.com/github/nilesh3030/Stroke-Prediction/blob/main/Adding_weather_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the CSV files read by this notebook
# are all addressed under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np

# Remove pandas' display limits so every column and row of a displayed frame
# is rendered (None means "no limit").
pd.set_option ('display.max_columns', None)
pd.set_option ('display.max_rows', None)

# Case-level records. low_memory=False tells pandas not to process the file in
# internal chunks while inferring dtypes.
raw_data = pd.read_csv('/content/drive/MyDrive/Stroke_Prediction/Data/stroke_case.csv', low_memory = False)

# Nearest-branch lookup table; the file is decoded as EUC-KR.
nearest_branch_super = pd.read_csv('/content/drive/MyDrive/Stroke_Prediction/Data/Nearest_Branch_Data.csv', encoding = 'euc_kr')

# Weather observations; the file is decoded as EUC-KR.
weather_data = pd.read_csv('/content/drive/MyDrive/Stroke_Prediction/Data/OBS_AWS_TIM_20221006143609.csv', encoding = 'euc_kr')

# Processing the weather data

In [None]:
# Overwrite the weather header with English names. This is a positional
# rename: it assumes the CSV has exactly these eight columns in this order
# (pandas raises a length-mismatch error otherwise) — TODO confirm the order.
weather_data.columns = ['Branch_code', 'Branch_name', 'Date_time', 'Temperature', 'Wind_Speed(m/s)', 'Precipitation(mm)', 'Pressure(hPa)', 'Humidity(%)']

In [None]:
# Reduce each observation timestamp to an hour-resolution string
# ('YYYY-MM-DD HH'). The call timestamps are formatted the same way later, and
# the two strings are used together as a merge key.
weather_data['Date_time'] = pd.to_datetime(weather_data['Date_time']).dt.strftime('%Y-%m-%d %H')

In [None]:
# Preview the weather table after renaming the columns and normalising Date_time.
weather_data.head()

# Processing the Nearest Branch data

In [None]:
# Size and first rows of the nearest-branch table as loaded.
print(nearest_branch_super.shape)
nearest_branch_super.head()

In [None]:
# List the available columns; a subset of them is selected in the next cell.
nearest_branch_super.columns

In [None]:
# Columns kept from the nearest-branch table.
#   'latitude', 'longitude'  -> location of the address, i.e. the patient location
#   'Latitude', 'Longitude'  -> location of the branch address
# The list is deliberately not called `vars`, which would shadow the Python
# builtin of the same name for the rest of the session.
branch_columns = ['jaenan_sn', 'call_d', 'call_t', 'latitude', 'longitude', 'Branch Name', 'Latitude', 'Longitude']
nearest_branch = nearest_branch_super[branch_columns]

In [None]:
# Cast the call date and call time to text so they can be concatenated.
# NOTE(review): if call_t was read as an integer, leading zeros are already
# lost at this point (e.g. 01:30 arrives as "130") — confirm against the CSV.
convert_dict = dict.fromkeys(('call_d', 'call_t'), str)
nearest_branch = nearest_branch.astype(convert_dict)

# Raw "date + time" text; it is parsed into a real timestamp in a later cell.
call_date_text = nearest_branch["call_d"]
call_time_text = nearest_branch["call_t"]
nearest_branch["Datetime_patient"] = call_date_text + call_time_text

In [None]:
def convert_datetime(row):
  """Parse row['Datetime_patient'] (date digits followed by time digits) into a Timestamp.

  Only values of length 11 or 12 are accepted, i.e. an 8-character date
  followed by a 3- or 4-character time. A 3-character time is left-padded with
  a zero before parsing: without the padding, '%H' greedily consumes two
  digits, so '20200101130' (01:30) would be read as 13:00.

  Returns:
    pandas.Timestamp on success, pandas.NaT when the value is not a string,
    has an unexpected length, or does not match the '%Y%m%d%H%M' format.
  """
  raw = row['Datetime_patient']
  if not isinstance(raw, str) or len(raw) not in (11, 12):
    return pd.NaT

  # First 8 characters are the date; the remainder is the time.
  stamp = raw[:8] + raw[8:].zfill(4)
  try:
    return pd.to_datetime(stamp, format='%Y%m%d%H%M')
  except (ValueError, TypeError):
    # Unparseable text is treated as missing rather than aborting the apply().
    return pd.NaT


# Parse every row's Datetime_patient text; the function is passed directly,
# one call per row (axis=1).
nearest_branch['Datetime_converted'] = nearest_branch.apply(convert_datetime, axis=1)

In [None]:
# Format the parsed call time as an hour-resolution string ('YYYY-MM-DD HH'),
# the same format given to weather_data['Date_time'], so both can serve as a
# merge key. Requires the column to hold datetime values (.dt accessor).
nearest_branch['Datetime_converted'] = nearest_branch['Datetime_converted'].dt.strftime('%Y-%m-%d %H')

In [None]:
# Preview the table including the new Datetime_patient / Datetime_converted columns.
nearest_branch.head()

# Combining weather and nearest-branch data

In [None]:
# Left join: every call record is kept, and the weather row with the same
# branch name and the same hour string is attached (NaN where none matches).
new_df = nearest_branch.merge(
    weather_data,
    how='left',
    left_on=['Branch Name', 'Datetime_converted'],
    right_on=['Branch_name', 'Date_time'],
)

In [None]:
# Size and first rows of the merged table. A left merge keeps all left-hand
# rows; the row count only grows if a (branch, hour) key is duplicated in the
# weather table — worth comparing with nearest_branch.shape.
print(new_df.shape)
new_df.head()

In [None]:
# Drop every row that has a missing value in ANY column of the merged table.
# NOTE(review): this removes rows without a weather match, but also rows that
# are missing e.g. a coordinate or a parsed call time — confirm that is intended.
clean_data_with_weather_variables = new_df.dropna()
clean_data_with_weather_variables.shape

# Filtering the raw data and adding the cleaned weather data

In [None]:
# There are many junk columns that have been added, so we remove all of them.
# The label-based slice keeps every column from the first one up to and
# including 'pti2' and discards everything to its right.
raw_data = raw_data.loc[:,:'pti2']
print(raw_data.shape)

In [None]:
# Final categorical variables based on our selection
# (cv2_cphss is used later as the prediction target).
cat_vars_final = [
    'cv_cc', 'cv_etc',
    'cv2_cc', 'cv2_phx_yn', 'cv2_act',
    'sex',
    'sx1', 'sx2', 'sx3', 'sx4', 'sx5',
    'medical_history',
    'stroke',
    'obstacle2',
    'cv2_cphss',
]

In [None]:
# Final numerical variables based on our selection.
# 'jaenan_sn' comes first; it is the key used to join with the weather data.
# The remaining names come in two sets, suffixed 1 and 2.
num_vars_final = [
    'jaenan_sn', 'age',
    'dbp1', 'sbp1', 'pr1', 'rr1', 'bt1', 'spo2_1',
    'dbp2', 'sbp2', 'pr2', 'rr2', 'bt2', 'spo2_2',
]

In [None]:
# All selected columns: categorical ones first, then numerical ones.
final_variables = [*cat_vars_final, *num_vars_final]

In [None]:
# Keep only the selected columns of the raw data (raises KeyError if any
# listed name is absent), then show the resulting size.
filtered_data = raw_data[final_variables]
filtered_data.shape

# Joining the final data with the cleaned weather data

In [None]:
# Inner join on the case key: only cases present in both the filtered raw data
# and the cleaned weather table survive.
final_data = filtered_data.merge(
    clean_data_with_weather_variables,
    how='inner',
    on='jaenan_sn',
)

In [None]:
# Size and first rows of the joined data set.
print(final_data.shape)
final_data.head()

# Translating and cleaning the final data

In [None]:
# Install the pinned googletrans release that provides the Translator used below.
!pip install googletrans==4.0.0-rc1

In [None]:
from googletrans import Translator

# Build a {original value: translated text} lookup covering every distinct
# value found in the categorical columns.
# NOTE(review): each translate() call presumably goes out to the translation
# service, so this cell is slow and needs network access.
translator = Translator()
translations = {}
for column in cat_vars_final:
    for element in final_data[column].unique():
        # The same value commonly occurs in several columns (e.g. sx1..sx5);
        # translate it once instead of repeating the request per column.
        if element in translations:
            continue
        translations[element] = translator.translate(element).text


In [None]:
# Missing values are not a category, so remove the NaN key from the lookup.
# A default is supplied so the cell does not raise KeyError when no NaN key
# exists (no missing values, or the cell is run a second time).
translations.pop(np.nan, None)

### Updating the incorrect translations in the translation dictionary
# NOTE(review): the labels below are kept exactly as they were because later
# steps may match on them, but three look wrong and should be confirmed:
#   '오심' as a symptom normally means nausea, not 'misdiagnosis';
#   'body weekness' / 'Vaginal blooding' look like typos (weakness / bleeding).
translations.update({'심,뇌혈관계':'Heart, cerebrovascular',
'발음이상':'strange pronunciation',
'사지 저림':'numb feet and arms',
'음성':'negative',
'양성':'positive',
'남':'male',
'어지러움':'Dizziness',
'전신쇠약':'body weekness',
'오심':'misdiagnosis',
'심계항진':'Palpitations',
'질출혈':'Vaginal blooding'})

# Apply the lookup to the whole frame: any cell equal to a key is replaced by
# its translation. Assigned back rather than mutated in place.
final_data = final_data.replace(translations)

In [None]:
# Keep only the rows whose target column holds one of the two usable labels.
# NOTE: `vars` shadows the Python builtin; the name is kept only because the
# following cell reads it.
vars = ['positive', 'negative']
# .copy() makes final_data an independent frame instead of a slice of the
# previous one, so later column writes do not hit chained-assignment warnings.
final_data = final_data.loc[final_data['cv2_cphss'].isin(vars)].copy()

final_data.shape

In [None]:
# Replacing the target variable with 1 ('positive') and 0 ('negative').
# The result is assigned back to the column. Calling replace(..., inplace=True)
# on final_data['cv2_cphss'] is chained assignment: pandas warns about it and,
# with Copy-on-Write enabled, it leaves final_data unchanged.
final_data['cv2_cphss'] = final_data['cv2_cphss'].replace(vars, [1, 0])

In [None]:
# We will replace the junk values with NaN.
replace_values = {999.0 : np.nan, 999.9 : np.nan, 943.0 : np.nan, 9999.0 : np.nan, 7777.0 : np.nan, 6666.0 : np.nan, 770.0 : np.nan}

# The weather measurements are excluded from the scrub: applied to the whole
# frame, the mapping would also blank legitimate readings such as a pressure of
# 999.0 or 999.9 hPa. Every other column is treated exactly as before.
# NOTE(review): assumes none of the codes above is a missing-value marker in
# the weather export — confirm against the data provider's documentation.
weather_measurements = ['Temperature', 'Wind_Speed(m/s)', 'Precipitation(mm)', 'Pressure(hPa)', 'Humidity(%)']
scrub_columns = [col for col in final_data.columns if col not in weather_measurements]

# Nested mapping {column: {old: new}} limits the replacement to those columns.
final_data = final_data.replace({col: replace_values for col in scrub_columns})

In [None]:
# Share of missing values per column, in percent, rounded to two decimals.
missing_counts = final_data.isna().sum()
null_variables = missing_counts.div(len(final_data)).mul(100).round(2)
#null_variables

In [None]:
def spasm(row):
    """Return 1 if any of row['sx1'] .. row['sx5'] equals 'Spasm/seizure', else 0.

    The slots are checked in order and checking stops at the first match.
    """
    symptom_slots = ('sx1', 'sx2', 'sx3', 'sx4', 'sx5')
    return int(any(row[slot] == 'Spasm/seizure' for slot in symptom_slots))

def dizziness(row):
    """Return 1 if any of row['sx1'] .. row['sx5'] equals 'Dizziness', else 0.

    The slots are checked in order and checking stops at the first match.
    """
    symptom_slots = ('sx1', 'sx2', 'sx3', 'sx4', 'sx5')
    return int(any(row[slot] == 'Dizziness' for slot in symptom_slots))

def faint(row):
    """Return 1 if any of row['sx1'] .. row['sx5'] equals 'faint', else 0.

    The slots are checked in order and checking stops at the first match.
    """
    symptom_slots = ('sx1', 'sx2', 'sx3', 'sx4', 'sx5')
    return int(any(row[slot] == 'faint' for slot in symptom_slots))

def consciousness(row):
    """Return 1 if any of row['sx1'] .. row['sx5'] equals 'Consciousness', else 0.

    The slots are checked in order and checking stops at the first match.
    """
    symptom_slots = ('sx1', 'sx2', 'sx3', 'sx4', 'sx5')
    return int(any(row[slot] == 'Consciousness' for slot in symptom_slots))

def paralysis(row):
    """Return 1 if any of row['sx1'] .. row['sx5'] equals 'paralysis', else 0.

    The slots are checked in order and checking stops at the first match.
    """
    symptom_slots = ('sx1', 'sx2', 'sx3', 'sx4', 'sx5')
    return int(any(row[slot] == 'paralysis' for slot in symptom_slots))

def disease_history(row):
    """Return 1 if any of the ten disease-history fields of `row` equals 'Yes', else 0.

    The fields are checked in the order listed and checking stops at the first
    match.
    NOTE(review): this function is not called in the visible part of the
    notebook, and these columns are not among the ones selected above — confirm
    whether it is still needed.
    """
    history_fields = (
        'hypertension', 'diabetes', 'chest_disease', 'heart_disease',
        'tuberculosis', 'hepatitis', 'liver', 'allergy', 'cancer',
        'renal_failure',
    )
    return int(any(row[field] == 'Yes' for field in history_fields))

# One 0/1 flag column per symptom, computed row by row (axis=1) from the five
# symptom slots. The flag functions are passed to apply() directly.
final_data['Spasm'] = final_data.apply(spasm, axis=1)
final_data['Dizziness'] = final_data.apply(dizziness, axis=1)
final_data['Faint'] = final_data.apply(faint, axis=1)
final_data['Consciousness'] = final_data.apply(consciousness, axis=1)
final_data['Paralysis'] = final_data.apply(paralysis, axis=1)

In [None]:
# List the columns now present, including the new symptom flags.
final_data.columns

In [None]:
# Remove the columns that are no longer needed: the raw symptom slots (already
# encoded as flag columns), two cv2_* fields, the case key, and the location,
# call-time and branch columns that came in with the weather join.
final_data = final_data.drop(
    columns=[
        'sx1', 'sx2', 'sx3', 'sx4', 'sx5',
        'cv2_cc', 'cv2_phx_yn',
        'jaenan_sn', 'call_d', 'call_t',
        'latitude', 'longitude',
        'Branch Name', 'Latitude', 'Longitude',
        'Datetime_patient', 'Datetime_converted',
        'Branch_code', 'Branch_name', 'Date_time',
    ]
)

print(final_data.shape)
final_data.head()