## description 으로 points 분류

In [1]:
import pandas as pd
import re

In [2]:
df = pd.read_csv('winemag-data_first150k.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
1,1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
4,4,France,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,Provence red blend,Domaine de la Bégude


* 학습 가능 조건 : X는 데이터프레임이어도 괜찮음 /  y는 1차원(array, series 등)

In [3]:
df_X = df[['description']]
series_y = df['points']

In [4]:
df_X.description[0]

'This tremendous 100% varietal wine hails from Oakville and was aged over three years in oak. Juicy red-cherry fruit and a compelling hint of caramel greet the palate, framed by elegant, fine tannins and a subtle minty tone in the background. Balanced and rewarding from start to finish, it has years ahead of it to develop further nuance. Enjoy 2022–2030.'

In [5]:
text_list = []
for i in range(len(df_X)):
    text = df_X['description'][i]
    text = re.sub('[^\w\s]', '', text) # \s(공백 문자)
    #[\W\s] 했더니, 단어 간 띄어쓰기가 없어짐ㅋㅋ  
    text = re.sub('\d', '', text)
    text = re.sub('[ \t]+' , ' ', text) # 공백 제거
    text_list.append(text)

In [6]:
text_list[1:10]

['Ripe aromas of fig blackberry and cassis are softened and sweetened by a slathering of oaky chocolate and vanilla This is full layered intense and cushioned on the palate with rich flavors of chocolaty black fruits and baking spices A toasty everlasting finish is heady but ideally balanced Drink through ',
 'Mac Watson honors the memory of a wine once made by his mother in this tremendously delicious balanced and complex botrytised white Dark gold in color it layers toasted hazelnut pear compote and orange peel flavors reveling in the succulence of its gL of residual sugar',
 'This spent months in new French oak and incorporates fruit from Ponzis Aurora Abetina and Madrona vineyards among others Aromatic dense and toasty it deftly blends aromas and flavors of toast cigar box blackberry black cherry coffee and graphite Tannins are polished to a fine sheen and frame a finish loaded with dark chocolate and espresso Drink now through ',
 'This is the top wine from La Bégude named after t

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(min_df=0., max_df=1.0)

In [8]:
X = vect.fit_transform(text_list) 

In [9]:
type(X)

scipy.sparse.csr.csr_matrix

In [10]:
X.shape

(150930, 42965)

In [11]:
df.shape

(150930, 11)

In [12]:
df_X.description.head()

0    This tremendous 100% varietal wine hails from ...
1    Ripe aromas of fig, blackberry and cassis are ...
2    Mac Watson honors the memory of a wine once ma...
3    This spent 20 months in 30% new French oak, an...
4    This is the top wine from La Bégude, named aft...
Name: description, dtype: object

In [13]:
import numpy as np
y = np.array(series_y)

In [14]:
y

array([96, 96, 96, ..., 91, 90, 90], dtype=int64)

# Train, Test set 나누기


In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(X, test_size=0.25, random_state=0)
X_train.shape, X_test.shape

((113197, 42965), (37733, 42965))

In [16]:
y_train, y_test = train_test_split(y, test_size=0.25, random_state=0)
y_train.shape, y_test.shape

((113197,), (37733,))

# 1. 베르누이 분포 나이브 베이즈

In [17]:
from sklearn.naive_bayes import BernoulliNB
model_bern = BernoulliNB()

In [18]:
model_bern.fit(X_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [19]:
pre = model_bern.predict(X_test)

In [20]:
print((pre == y_test).mean())

0.3181300188164206


## 교차검증

In [21]:
from sklearn.model_selection import cross_val_score

In [22]:
cross_val_score(model_bern, X, y, scoring="accuracy", cv=10).mean() ##분류이므로 scoring = accuracy, 회기는 r2

0.33818679293038134

In [23]:
# cv = 1000 일 때, 0.34 정도

# 2. 가우시안 정규 분포 나이브 베이즈

* A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

* One-Hot-Encoding 결과는 메모리 절약을 위해 스파스 행렬(sparse matrix) 형식으로 출력된다. 일반적인 배열로 바꾸려면 toarray 메서드를 사용

* https://datascienceschool.net/view-notebook/c19b48e3c7b048668f2bb0a113bd25f7/

In [24]:
# print(X)

In [25]:
# X = X.todense

In [26]:
# print(X)

In [27]:
# from sklearn.naive_bayes import GaussianNB
# #가우시안은 sparse(X) but dens(Y)
# model_G = GaussianNB().fit(X,y)

# 3. 다항 분포 나이브 베이즈
* https://datascienceschool.net/view-notebook/c19b48e3c7b048668f2bb0a113bd25f7/

In [28]:
from sklearn.naive_bayes import MultinomialNB
model_M = MultinomialNB().fit(X,y)

In [29]:
pre = model_M.predict(X_test)

In [30]:
print((pre == y_test).mean())

0.4594121856200143


# 다른 사람 아이디어

* Pandas에서 배열의 합계나 평균같은 일반적인 통계는 DataFrame내 함수를 사용하면 되지만, 

* Pandas에서 제공하지 않는 기능, 즉 내가 만든 커스텀 함수(custom function)를 DataFrame에 적용하려면 

* map함수, apply함수, applymap함수를 사용하면 된다

* ------------------------------------------------------ 

* 점수를 구간으로 나눔
* 80, 81, 82, 83, 84, 얘네를 다 B 구간에 넣으면 1/20 -> 1/5 
* B A S SS 넷중에 하나만 들어가면 되니까! 정확도가 올라감

* def rank_estimator(score):
    
    if (85 >= score >= 80):
        return "B"
    elif (90 >= score > 85):
        return "A"
    elif (95 >= score > 90):
        return "S"
    else:
        return "SS"

* data_test.points = data_test.points.apply(rank_estimator)
* data_test.head()

* --> 모든 열에 rank_estimator 함수가 적용됨
* --> 위에서 정의한 함수의 매개변수(score)는 
* --> 자동적으로 / dataframe의 data(values)가 됨
* --> apply 함수는 모든 컬럼에 적용됨. df_y는 컬럼이 1개라서!