In [95]:
%%file bigdata.py

from mrjob.job import MRJob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import exp
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import scale
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
import seaborn as sns

class map_read(MRJob):
    def mapper(self, _, line):
        line = line.split(",")
        if line[2] == "1":
            yield "Female", (line[3], line[4], line[6], line[9], line[10], 
                             line[11], line[12], line[13], line[14],
                             line[15], line[16], line[17], line[18])
        else:
            yield "Male", (line[3], line[4], line[6], line[9], line[10], 
                           line[11], line[12], line[13], line[14],
                           line[15], line[16], line[17], line[18])
            
    def reducer(self, key, values):
        if key == "Female":
            fdeaths = 0
            fcount = 0
            fhospitalized = 0
            fpreexisting = 0
            '''
                index indicates which condition is present in person
                pneumonia, diabetes, copd, asthma, immunosuppressed, hypertension, other, 
                cardiovascular, obesity, renal, tobacco   
            '''
            fcondition = [0,0,0,0,0,0,0,0,0,0,0]
            for x in values:
                fcount += 1
                if x[0] == "1":
                    fhospitalized += 1
                if x[1] != "9999-99-99":
                    fdeaths += 1
                if "1" in x[2:-1]:
                    fpreexisting += 1
                for i in range(len(x[2:-1]) + 1):
                    if x[i + 2] == "1" and x[1] != "9999-99-99":
                        fcondition[i] += 1
                        
            yield "Female Stats", {"count": fcount, "deaths": fdeaths, 
                                   "hospitalized": fhospitalized,
                                   "pre-existing condition": fpreexisting}
            yield "Death Stats", {
            "Female": {
            "deaths": fdeaths,
            "hospitalized": fhospitalized,
            "conditions": {
                "pneumonia": fcondition[0],
                "diabetes": fcondition[1],
                "copd": fcondition[2],
                "asthma": fcondition[3],
                "immunosuppressed": fcondition[4],
                "hypertension": fcondition[5],
                "other": fcondition[6],
                "cardiovascular": fcondition[7],
                "obesity": fcondition[8],
                "renal": fcondition[9],
                "tobacco": fcondition[10]      
                }
            }}
        
        else:
            mdeaths = 0
            mcount = 0
            mhospitalized = 0
            mpreexisting = 0
            '''
                index indicates which condition is present in person
                pneumonia, diabetes, copd, asthma, immunosuppressed, hypertension, other, 
                cardiovascular, obesity, renal, tobacco   
            '''
            mcondition = [0,0,0,0,0,0,0,0,0,0,0]
            for x in values:
                mcount += 1
                if x[0] == "1":
                    mhospitalized += 1
                if x[1] != "9999-99-99":
                    mdeaths += 1
                if "1" in x[2:-1]:
                    mpreexisting += 1
                for i in range(len(x[2:-1]) + 1):
                    if x[i + 2] == "1" and x[1] != "9999-99-99":
                        mcondition[i] += 1
            yield "Male Stats", {"count": mcount, "deaths": mdeaths,
                                 "hospitalized": mhospitalized,
                                 "pre-existing condition": mpreexisting}
            yield "Death Stats", {
            "Male": {
            "deaths": mdeaths,
            "hospitalized": mhospitalized,
            "conditions": {
                "pneumonia": mcondition[0],
                "diabetes": mcondition[1],
                "copd": mcondition[2],
                "asthma": mcondition[3],
                "immunosuppressed": mcondition[4],
                "hypertension": mcondition[5],
                "other": mcondition[6],
                "cardiovascular": mcondition[7],
                "obesity": mcondition[8],
                "renal": mcondition[9],
                "tobacco": mcondition[10]      
                }
            }}
        

if __name__ =='__main__':
    map_read.run()

Overwriting bigdata.py


In [96]:
!python bigdata.py CovidData.csv

"Female Stats"	{"count":525064,"deaths":27402,"hospitalized":443338,"pre-existing condition":208078}
"Death Stats"	{"Female":{"deaths":27402,"hospitalized":443338,"conditions":{"pneumonia":18980,"diabetes":11545,"copd":1772,"asthma":788,"immunosuppressed":1159,"hypertension":13429,"other":1955,"cardiovascular":1725,"obesity":7119,"renal":2354,"tobacco":1120}}}
"Male Stats"	{"count":523512,"deaths":49541,"hospitalized":405206,"pre-existing condition":219872}
"Death Stats"	{"Male":{"deaths":49541,"hospitalized":405206,"conditions":{"pneumonia":34943,"diabetes":16720,"copd":2249,"asthma":692,"immunosuppressed":1459,"hypertension":18632,"other":2591,"cardiovascular":2710,"obesity":10175,"renal":3353,"tobacco":5476}}}


No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\Johnny\AppData\Local\Temp\bigdata.Johnny.20221212.214341.979101
Running step 1 of 1...
job output is in C:\Users\Johnny\AppData\Local\Temp\bigdata.Johnny.20221212.214341.979101\output
Streaming final output from C:\Users\Johnny\AppData\Local\Temp\bigdata.Johnny.20221212.214341.979101\output...
Removing temp directory C:\Users\Johnny\AppData\Local\Temp\bigdata.Johnny.20221212.214341.979101...
