In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.api as sm
import seaborn as sns
import warnings 

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Suppress every warning for the rest of the session (no category or module filter is given).
warnings.filterwarnings('ignore')
# Use seaborn's 'darkgrid' style for the plots drawn after this point.
sns.set_style('darkgrid')

**Data Source**

https://www.kaggle.com/datasets/thedevastator/std-infection-rates-in-america-1996-2008

In [2]:
# Read the STD cases CSV into df1. The path is relative, so the file is
# expected to sit in the notebook's working directory.
df1 = pd.read_csv('STD Cases.csv')

In [4]:
# Dimensions of the raw frame as (rows, columns).
df1.shape

(42680, 11)

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df1.describe()

Unnamed: 0,index,Disease Code,Year,STD Cases,Population,Rate per 100K
count,42680.0,42530.0,42530.0,42530.0,35483.0,35483.0
mean,21339.5,283.795533,2005.240607,603.133647,438979.4,325.66186
std,12320.799081,13.723063,5.431696,1973.091099,760558.8,697.545461
min,0.0,274.0,1996.0,1.0,12937.0,0.02
25%,10669.75,274.0,2001.0,8.0,94181.0,6.95
50%,21339.5,280.0,2005.0,59.0,196628.0,52.89
75%,32009.25,280.0,2010.0,380.0,432225.0,302.955
max,42679.0,310.0,2014.0,46885.0,8880836.0,9078.95


In [6]:
# Column dtypes and non-null counts, to see where values are missing.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42680 entries, 0 to 42679
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          42680 non-null  int64  
 1   Disease        42530 non-null  object 
 2   Disease Code   42530 non-null  float64
 3   State          42530 non-null  object 
 4   Year           42530 non-null  float64
 5   Gender         42530 non-null  object 
 6   Age            42530 non-null  object 
 7   Age Code       42530 non-null  object 
 8   STD Cases      42530 non-null  float64
 9   Population     35483 non-null  float64
 10  Rate per 100K  35483 non-null  float64
dtypes: float64(5), int64(1), object(5)
memory usage: 3.6+ MB


**Takeaways**

- 42680 rows and 11 columns.
- There are null values that need to be addressed: either drop them or find a reasonable method to impute them.
- Will need to convert some data types, e.g. `Year` (currently float64).

**Things to do**

- Complete data dictionary
- Preview Data to see actual values
- Come up with some hypotheses to plan project goals.


**Data Dictionary**

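| Column name | Description |
| --- | --- |
| index | Row number carried over from the source file. (Integer) |
| Disease | The name of the STD. (String) |
| Disease Code | The numeric code for the STD, e.g. 274 for Chlamydia. (Numeric, loaded as float) |
| State | The state where the cases were reported. (String) |
| Year | The year the cases were reported. (Integer, loaded as float) |
| Gender | The gender of the group the cases were counted for. (String) |
| Age | The age group of the cases, e.g. `0-14 years` or `Unknown`. (String) |
| Age Code | The code for the age group, e.g. `0-14`, or `99` for `Unknown`. (String) |
| STD Cases | The number of STD cases. (Integer, loaded as float) |
| Population | The population of the row's state / year / gender / age group; missing for some rows, e.g. `Unknown` age. (Integer, loaded as float) |
| Rate per 100K | The rate of STD cases per 100,000 people. (Float) |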

In [12]:
# Preview the raw frame; pandas shortens the display to the first and last rows.
df1

Unnamed: 0,index,Disease,Disease Code,State,Year,Gender,Age,Age Code,STD Cases,Population,Rate per 100K
0,0,Chlamydia,274.0,Alabama,1996.0,Male,0-14 years,0-14,25.0,468394.0,5.34
1,1,Chlamydia,274.0,Alabama,1996.0,Male,15-19 years,15-19,164.0,165920.0,98.84
2,2,Chlamydia,274.0,Alabama,1996.0,Male,20-24 years,20-24,193.0,152848.0,126.27
3,3,Chlamydia,274.0,Alabama,1996.0,Male,25-29 years,25-29,88.0,152778.0,57.60
4,4,Chlamydia,274.0,Alabama,1996.0,Male,30-34 years,30-34,55.0,155850.0,35.29
...,...,...,...,...,...,...,...,...,...,...,...
42675,42675,,,,,,,,,,
42676,42676,,,,,,,,,,
42677,42677,,,,,,,,,,
42678,42678,,,,,,,,,,


In [11]:
# Show every row that has a missing value in at least one column.
df1.loc[df1.isnull().any(axis="columns")]

Unnamed: 0,index,Disease,Disease Code,State,Year,Gender,Age,Age Code,STD Cases,Population,Rate per 100K
7,7,Chlamydia,274.0,Alabama,1996.0,Male,Unknown,99,63.0,,
29,29,Chlamydia,274.0,Alabama,1999.0,Male,Unknown,99,25.0,,
37,37,Chlamydia,274.0,Alabama,2000.0,Male,Unknown,99,28.0,,
45,45,Chlamydia,274.0,Alabama,2001.0,Male,Unknown,99,24.0,,
53,53,Chlamydia,274.0,Alabama,2002.0,Male,Unknown,99,18.0,,
...,...,...,...,...,...,...,...,...,...,...,...
42675,42675,,,,,,,,,,
42676,42676,,,,,,,,,,
42677,42677,,,,,,,,,,
42678,42678,,,,,,,,,,
