In [1]:
import numpy as np
import pandas as pd

## Numpy

In [4]:
# Build a 2x3 integer matrix from nested Python lists.
rows = [[1, 2, 3], [4, 5, 6]]
arr = np.array(rows)
arr.shape  # (2, 3)
arr.ndim  # 2
arr.max(axis=1)  # row-wise maximum: [3, 6]

# Element-wise conditional: True where the entry equals 2, False elsewhere.
np.where(arr == 2, True, False)

array([[False,  True, False],
       [False, False, False]])

## Pandas

### Sampling

In [7]:
# Load bike.csv and preview its first two rows.
# NOTE(review): absolute, user-specific path; the notebook only runs on a machine with this layout.
bike_csv = r"C:\Users\nuguz\Downloads\data\bike.csv"
df = pd.read_csv(bike_csv)
df.head(2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40


In [13]:
from sklearn.model_selection import train_test_split

seed = 123  # shared seed so the seeded draws below are repeatable

# Simple random sample of two rows.
df.sample(n=2, random_state=seed)
# Stratified sample: two rows drawn from every "season" group.
df.groupby("season").sample(n=2, random_state=seed)
# Random sample of 0.5% of the rows.
df.sample(frac=0.005, random_state=seed)
# Train/test split with 70% of the rows assigned to the training set.
df_train, df_test = train_test_split(df, train_size=0.7, random_state=seed)

# Unseeded two-row sample, with the index renumbered from 0.
df.sample(n=2).reset_index(drop=True)

### Missing Value

In [15]:
# Load iris_missing.csv and preview its first two rows.
iris_missing_csv = r"C:\Users\nuguz\Downloads\data\iris_missing.csv"
df = pd.read_csv(iris_missing_csv)
df.head(2)

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_Length,Petal_Width,Species
0,5.1,,,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa


In [23]:
df.isna().sum() # count the missing values in each column

Sepal_Length     7
Sepal_Width     13
Petal_Length    14
Petal_Width      6
Species          2
dtype: int64

In [28]:
# None of the results below are assigned, so `df` itself is left unchanged.

# Fill missing values with a fixed sentinel, chosen per column.
sentinel_fill = {"Sepal_Length": 999, "Sepal_Width": 999}
df.fillna(value=sentinel_fill)

# Fill missing Sepal_Width values with that column's mean.
sepal_width_mean = df["Sepal_Width"].mean()
df.fillna(value={"Sepal_Width": sepal_width_mean})

# Other statistics that could serve as fill values: quantile / median / std.
sepal_length = df["Sepal_Length"]
sepal_length.quantile(q=0.5)
sepal_length.median()
sepal_length.std()

# Drop every row that contains at least one missing value.
df.dropna(how='any')

### Preprocessing

- 조건에 따른 변수 값 생성
    df[col] = np.where(조건, 맞을 경우, 틀릴 경우)
- 변수 사이의 관계 확인
    pd.crosstab(Series, Series)
- 함수 적용
    df.apply(func = lambda x : ...)
- 날짜 처리
- one-hot encoding

In [29]:
# Load iris.csv and preview its first two rows.
iris_csv = r"C:\Users\nuguz\Downloads\data\iris.csv"
df = pd.read_csv(iris_csv)
df.head(2)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa


In [36]:
# Derive a new column: 10 where the species is "setosa", 0 otherwise.
setosa_flag = np.where(df["Species"] == "setosa", 10, 0)
df["is_setosa"] = setosa_flag

# Cross-tabulate the original label against the derived flag to check the mapping.
pd.crosstab(index=df["Species"], columns=df["is_setosa"]).reset_index()

is_setosa,Species,0,10
0,setosa,0,50
1,versicolor,50,0
2,virginica,50,0


In [40]:
# Rename a single column. The key must match an existing label exactly:
# this frame's header is "Sepal.Length" (dot, not underscore), and by default
# DataFrame.rename silently ignores keys that are not present.
# rename returns a new frame; the result is not assigned, so `df` is unchanged here.
df.rename(columns={"Sepal.Length": "SL"})

# Replace every column label at once (the list length must equal the column count).
df.columns = ["SL", "SW", "PL", "PW", "species", "is_setosa"]

In [42]:
# Rounded mean of each of the first three columns, returned as a two-column frame.
first_three_cols = df.iloc[:, :3]
first_three_cols.apply(func=lambda col: round(col.mean())).reset_index()

Unnamed: 0,index,0
0,SL,6
1,SW,3
2,PL,4


In [43]:
# Print a summary of the frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   SL         150 non-null    float64
 1   SW         150 non-null    float64
 2   PL         150 non-null    float64
 3   PW         150 non-null    float64
 4   species    150 non-null    object 
 5   is_setosa  150 non-null    int32  
dtypes: float64(4), int32(1), object(1)
memory usage: 6.6+ KB


####  - String

In [47]:
# Reload bike.csv for the string / datetime examples and preview two rows.
bike_csv = r"C:\Users\nuguz\Downloads\data\bike.csv"
df = pd.read_csv(bike_csv)
df.head(2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40


In [58]:
# Render each casual count as text and append the literal suffix "대" (e.g. "3대").
df["casual"].astype("str") + "대"
# First four characters of the raw datetime string, i.e. the year ("2011").
df["datetime"].str.slice(0, 4)

# Parse the column once and reuse the result for every .dt accessor below.
timestamps = pd.to_datetime(df["datetime"])
timestamps.dt.year
timestamps.dt.weekday
timestamps.dt.month
timestamps.dt.day
# Calendar dates of the first four rows.
timestamps.dt.date[:4]

0    2011-01-01
1    2011-01-01
2    2011-01-01
3    2011-01-01
Name: datetime, dtype: object

In [64]:
# One-hot encode "season", keeping one indicator column for every level.
season_all_levels = pd.get_dummies(data=df, columns=["season"], drop_first=False)
season_all_levels.iloc[:2, 11:]

# Same encoding with the first level dropped.
season_drop_first = pd.get_dummies(data=df, columns=["season"], drop_first=True)
season_drop_first.iloc[:2, 11:]

Unnamed: 0,season_2,season_3,season_4
0,0,0,0
1,0,0,0


### Merge

In [65]:
# Reload bike.csv for the concat / merge examples and preview two rows.
bike_csv = r"C:\Users\nuguz\Downloads\data\bike.csv"
df = pd.read_csv(bike_csv)
df.head(2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40


In [75]:
# Two disjoint row slices (rows 0-2 and rows 5-7) over the first four columns.
df1 = df.iloc[0:3, 0:4]
df2 = df.iloc[5:8, 0:4]

# Stack them vertically; each piece keeps its original index labels.
pd.concat([df1, df2], axis=0)

Unnamed: 0,index,datetime,season,holiday,workingday
0,5,2011-01-01 05:00:00,1,0,0
1,6,2011-01-01 06:00:00,1,0,0
2,7,2011-01-01 07:00:00,1,0,0


In [71]:
# Renumber df2 from 0 so its rows line up with df1's index, then join side by side.
df2_renumbered = df2.reset_index(drop=True)
pd.concat([df1, df2_renumbered], axis=1)

Unnamed: 0,datetime,season,holiday,workingday,datetime.1,season.1,holiday.1,workingday.1
0,2011-01-01 00:00:00,1,0,0,2011-01-01 05:00:00,1,0,0
1,2011-01-01 01:00:00,1,0,0,2011-01-01 06:00:00,1,0,0
2,2011-01-01 02:00:00,1,0,0,2011-01-01 07:00:00,1,0,0


In [76]:
# Load the two tables used in the join example.
group_members_csv = r"C:\Users\nuguz\Downloads\data\join_data_group_members.csv"
member_room_csv = r"C:\Users\nuguz\Downloads\data\join_data_member_room.csv"
df_A = pd.read_csv(group_members_csv)
df_B = pd.read_csv(member_room_csv)

In [85]:
# Preview the first two rows of df_A.
df_A.head(2)

Unnamed: 0,team,No,member
0,TWICE,1414,미나
1,TWICE,1462,나연


In [81]:
# Preview the first two rows of df_B.
df_B.head(2)

Unnamed: 0,floor,room,name
0,3,3G-A,미나
1,3,3G-B,나연


In [79]:
# Inner join: df_A.member is matched against df_B.name; both key columns are kept.
join_spec = {"how": "inner", "left_on": "member", "right_on": "name"}
pd.merge(left=df_A, right=df_B, **join_spec).head(2)

Unnamed: 0,team,No,member,floor,room,name
0,TWICE,1414,미나,3,3G-A,미나
1,TWICE,1462,나연,3,3G-B,나연


## EDA

### 상관분석

- pearson: 수치형, 연속형
- kendall, spearman: 순서형

In [87]:
from scipy.stats import pearsonr, spearmanr, kendalltau

In [89]:
# Reload bike.csv for the correlation examples and preview two rows.
bike_csv = r"C:\Users\nuguz\Downloads\data\bike.csv"
df = pd.read_csv(bike_csv)
df.head(2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40


In [92]:
# Pairwise correlation matrix (DataFrame.corr default method) for the selected columns.
corr_columns = ["temp", "atemp", "casual", "count", "registered"]
df[corr_columns].corr()

Unnamed: 0,temp,atemp,casual,count,registered
temp,1.0,0.984948,0.467097,0.394454,0.318571
atemp,0.984948,1.0,0.462067,0.389784,0.314635
casual,0.467097,0.462067,1.0,0.690414,0.49725
count,0.394454,0.389784,0.690414,1.0,0.970948
registered,0.318571,0.314635,0.49725,0.970948,1.0


In [95]:
# Pearson correlation between registered and total counts.
# Compare pval against the 0.05 significance threshold.
stat, pval = pearsonr(x=df["registered"], y=df["count"])

(0.9709481058098284, 0.0)

### 비계층적 군집분석

In [96]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [97]:
# Reload iris.csv for the clustering example and preview two rows.
iris_csv = r"C:\Users\nuguz\Downloads\data\iris.csv"
df = pd.read_csv(iris_csv)
df.head(2)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa


In [103]:
# Every column except the last one is treated as a feature to be rescaled.
scaling_input = df.iloc[:, :-1]

# Fit the min-max scaler on those columns, then transform the same data.
model = MinMaxScaler().fit(X=scaling_input)
norms = model.transform(scaling_input)

# Wrap the scaled array in a frame with the original column labels for display.
pd.DataFrame(norms, columns=scaling_input.columns).head(2)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
0,0.222222,0.625,0.067797,0.041667
1,0.166667,0.416667,0.067797,0.041667


In [108]:
# Cluster on the first four columns (the numeric measurements) only.
# Selecting `:4` instead of `:-1` keeps this cell re-runnable: once the
# "cluster" column is appended below, `:-1` would start to include the
# string "Species" column and the fit would fail.
# n_init is set explicitly because its default differs between scikit-learn
# releases; 10 is the historical default — TODO confirm against the installed version.
model = KMeans(n_clusters=3, n_init=10, random_state=123).fit(df.iloc[:, :4])
model.labels_  # cluster index assigned to each row
model.cluster_centers_  # one centroid per cluster, one coordinate per feature

# Attach the labels to the frame and show the cluster sizes.
df["cluster"] = model.labels_
df["cluster"].value_counts()

2    62
1    50
0    38
Name: cluster, dtype: int64

### Evaluation

In [138]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [123]:
# Select petal length / petal width by position (third and fourth columns).
# The labels "PL"/"PW" only exist after the column-rename cell further down,
# so label-based access raises KeyError when the notebook is run top to bottom;
# the positions are the same before and after that rename.
y_true = df.iloc[:, 2]
y_pred = df.iloc[:, 3]

# MAE, MSE and RMSE; the MSE is computed once and reused for the square root.
mean_absolute_error(y_true=y_true, y_pred=y_pred)
mse = mean_squared_error(y_true=y_true, y_pred=y_pred)
mse
mse ** 0.5

### 단순회귀분석

In [109]:
from statsmodels.formula.api import ols

In [110]:
# Reload iris.csv and switch to short column labels for use in model formulas.
iris_csv = r"C:\Users\nuguz\Downloads\data\iris.csv"
short_labels = ["SL", "SW", "PL", "PW", "species"]
df = pd.read_csv(iris_csv)
df.columns = short_labels
df.head(2)

Unnamed: 0,SL,SW,PL,PW,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa


In [112]:
# Simple linear regression: PL is the response, PW the single predictor.
simple_formula = "PL ~ PW"
model = ols(formula=simple_formula, data=df).fit()
model.summary()

0,1,2,3
Dep. Variable:,PL,R-squared:,0.927
Model:,OLS,Adj. R-squared:,0.927
Method:,Least Squares,F-statistic:,1882.0
Date:,"Tue, 24 Jan 2023",Prob (F-statistic):,4.6800000000000005e-86
Time:,21:07:26,Log-Likelihood:,-101.18
No. Observations:,150,AIC:,206.4
Df Residuals:,148,BIC:,212.4
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.0836,0.073,14.850,0.000,0.939,1.228
PW,2.2299,0.051,43.387,0.000,2.128,2.332

0,1,2,3
Omnibus:,2.438,Durbin-Watson:,1.43
Prob(Omnibus):,0.295,Jarque-Bera (JB):,1.966
Skew:,0.211,Prob(JB):,0.374
Kurtosis:,3.369,Cond. No.,3.7


R-squared: 0.4 이상이면 괜찮은 모델

coef: y = a * x + b 에서 추정된 계수 — 변수 행(예: PW)의 값이 기울기 a, Intercept 행의 값이 절편 b

P>|t|: 각 계수의 유의확률(p-value). 유의수준 5%(신뢰수준 95%) 기준으로 0.05보다 작으면 해당 계수가 통계적으로 유의하다고 판단

Df Model: 예측변수 개수

Durbin-Watson: 잔차의 독립성(자기상관) 지표 (1.5~2.5면 독립으로 판단)

In [114]:
# Predictions of the fitted model for the first two rows of df.
model.predict(df)[:2]

0    1.529546
1    1.529546
dtype: float64

In [115]:
from sklearn.linear_model import LinearRegression

In [118]:
# scikit-learn equivalent of a simple regression. Here PL is the single feature
# and PW the target, i.e. the reverse direction of the "PL ~ PW" formula fit.
# Double brackets keep the feature selection as a DataFrame rather than a Series.
pl_feature = df[["PL"]]
pw_target = df["PW"]
model = LinearRegression().fit(X=pl_feature, y=pw_target)
model.predict(pl_feature)[:2]

array([0.21898206, 0.21898206])

### 다중회귀분석

In [124]:
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

In [126]:
# Reload bike.csv and keep only the columns from "season" through "casual"
# (label-based slices include both endpoints).
bike_csv = r"C:\Users\nuguz\Downloads\data\bike.csv"
df = pd.read_csv(bike_csv)
df = df.loc[:, "season":"casual"]
df.head(2)

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual
0,1,0,0,1,9.84,14.395,81,0.0,3
1,1,0,0,1,9.02,13.635,80,0.0,8


In [129]:
# Every column except the last ("casual") is used as a predictor.
predictor_terms = "+".join(df.columns[:-1])
formula = "casual ~ " + predictor_terms

# Build the response (y) and design (X) matrices as DataFrames.
y, X = dmatrices(formula_like=formula, data=df, return_type="dataframe")

In [134]:
# Variance inflation factor for each column of the design matrix X.
vif_values = [vif(X.values, col_idx) for col_idx in range(X.shape[1])]
df_vif = pd.DataFrame({"colname": X.columns, "VIF": vif_values})

# Rule of thumb: a VIF above 10 indicates multicollinearity, so the variable should be dropped.
df_vif

Unnamed: 0,colname,VIF
0,Intercept,34.029472
1,season,1.137211
2,holiday,1.069731
3,workingday,1.071196
4,weather,1.23615
5,temp,35.516012
6,atemp,35.550831
7,humidity,1.425034
8,windspeed,1.195704


In [136]:
# Multiple regression of "casual" on every other column of df.
predictor_terms = "+".join(df.columns[:-1])
formula = "casual ~ " + predictor_terms
model = ols(formula=formula, data=df).fit()
model.summary()

0,1,2,3
Dep. Variable:,casual,R-squared:,0.439
Model:,OLS,Adj. R-squared:,0.439
Method:,Least Squares,F-statistic:,1064.0
Date:,"Tue, 24 Jan 2023",Prob (F-statistic):,0.0
Time:,21:26:29,Log-Likelihood:,-54877.0
No. Observations:,10886,AIC:,109800.0
Df Residuals:,10877,BIC:,109800.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,46.8575,2.093,22.389,0.000,42.755,50.960
season,2.0324,0.343,5.929,0.000,1.360,2.704
holiday,-12.6947,2.227,-5.699,0.000,-17.061,-8.329
workingday,-37.1780,0.797,-46.671,0.000,-38.739,-35.617
weather,3.3336,0.629,5.297,0.000,2.100,4.567
temp,1.3288,0.274,4.842,0.000,0.791,1.867
atemp,1.4260,0.252,5.649,0.000,0.931,1.921
humidity,-0.9104,0.022,-40.908,0.000,-0.954,-0.867
windspeed,0.0582,0.048,1.211,0.226,-0.036,0.152

0,1,2,3
Omnibus:,4414.734,Durbin-Watson:,0.221
Prob(Omnibus):,0.0,Jarque-Bera (JB):,29504.708
Skew:,1.808,Prob(JB):,0.0
Kurtosis:,10.209,Cond. No.,464.0


### 로지스틱 회귀

In [139]:
from statsmodels.api import Logit
from sklearn.linear_model import LogisticRegression

In [141]:
# Reload iris.csv and add a 0/1 target: 1 when the species is "setosa".
iris_csv = r"C:\Users\nuguz\Downloads\data\iris.csv"
df = pd.read_csv(iris_csv)
setosa_mask = df["Species"] == "setosa"
df["is_setosa"] = setosa_mask + 0  # adding 0 turns the boolean mask into integers
df.head(2)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species,is_setosa
0,5.1,3.5,1.4,0.2,setosa,1
1,4.9,3.0,1.4,0.2,setosa,1


In [145]:
# Logistic regression (statsmodels) of is_setosa on the first two columns.
# NOTE(review): exog contains no constant column, so this appears to be fit
# without an intercept — confirm that is intended.
logit_exog = df.iloc[:, :2]
model = Logit(endog=df["is_setosa"], exog=logit_exog).fit()
model.params

# The predicted values are thresholded at 0.5, so `pred` is a boolean
# class prediction rather than the raw probability.
pred = model.predict(logit_exog) > 0.5

Optimization terminated successfully.
         Current function value: 0.036374
         Iterations 11


In [147]:
# scikit-learn logistic regression on the same two feature columns.
logreg_features = df.iloc[:, :2]
logreg_target = df["is_setosa"]
model = LogisticRegression().fit(X=logreg_features, y=logreg_target)

# Per-class probability estimates for every row.
pred = model.predict_proba(logreg_features)

### 나이브베이즈

In [148]:
from sklearn.naive_bayes import GaussianNB

In [149]:
# Gaussian naive Bayes on the first four columns.
nb_features = df.iloc[:, :4]
nb_target = df["is_setosa"]
model = GaussianNB().fit(X=nb_features, y=nb_target)
model.class_prior_  # prior probability of each class
model.theta_  # per-class mean of each feature

# Per-class probability estimates for every row.
pred = model.predict_proba(nb_features)

### KNN

In [150]:
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

In [153]:
# 3-nearest-neighbours classifier on the first four columns.
knn_features = df.iloc[:, :4]
knn_target = df["is_setosa"]
model = KNeighborsClassifier(n_neighbors=3).fit(X=knn_features, y=knn_target)

# Predicted class labels for the same rows the model was fit on.
pred = model.predict(knn_features)

### 의사결정나무

In [155]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

In [157]:
# Decision-tree classifier on the first four columns, seeded for repeatability.
tree_features = df.iloc[:, :4]
tree_target = df["is_setosa"]
model = DecisionTreeClassifier(random_state=123).fit(X=tree_features, y=tree_target)

# Predicted class labels for the same rows the model was fit on.
pred = model.predict(tree_features)