# 2022.09.05.(화)
---
##### [1] language 
- 목적    : 알파벳(a-z)을 사용하는 언어 식별
- 데이터  : 알파벳을 사용하는 4개 언어(en, fr, id, tl)의 학습/테스트용 텍스트 파일
- 전제조건: 언어별로 자주 사용되는 알파벳이 다름

(1) 전처리   
- 나라별 알파벳 패턴 분석  
  
(2) 학습   
- 교차 검증 적용  

In [20]:
import os
import re
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import cross_validate
from sklearn.utils import all_estimators
from sklearn.model_selection import train_test_split

path_train = './lang/train/'
path_test  = './lang/test/'

# List every file in the training folder.
# os.listdir() returns entries in arbitrary order, so sort the names: the rows
# of the feature table and the labels built later follow this order and must
# be the same on every run and platform.
dir_list = sorted(os.listdir(path_train))
dir_list

['en-1.txt',
 'en-2.txt',
 'en-3.txt',
 'en-4.txt',
 'en-5.txt',
 'fr-10.txt',
 'fr-6.txt',
 'fr-7.txt',
 'fr-8.txt',
 'fr-9.txt',
 'id-11.txt',
 'id-12.txt',
 'id-13.txt',
 'id-14.txt',
 'id-15.txt',
 'tl-16.txt',
 'tl-17.txt',
 'tl-18.txt',
 'tl-19.txt',
 'tl-20.txt']

In [21]:
# Read one training sample and fold it to lower case so that upper- and
# lower-case letters are treated as the same letter.
with open(path_train + 'en-1.txt', mode='r', encoding='utf-8') as f:
    text = f.read().lower()
text

'\n\n\n\nthe main henry ford museum building houses some of the classrooms for the henry ford academy\n\n\nhenry ford academy is the first charter school in the united states to be developed jointly by a global corporation, public education, and a major nonprofit cultural institution. the school is sponsored by the ford motor company, wayne county regional educational service agency and the henry ford museum and admits high school students. it is located in dearborn, michigan on the campus of the henry ford museum. enrollment is taken from a lottery in the area and totaled 467 in 2010.[1]\nfreshman meet inside the main museum building in glass walled classrooms, while older students use a converted carousel building and pullman cars on a siding of the greenfield village railroad. classes are expected to include use of the museum artifacts, a tradition of the original village schools. when the museum was established in 1929, it included a school which served grades kindergarten to colle

In [22]:
# Extract only the runs of ASCII lowercase letters with a regular expression
import re
p = re.compile('[a-z]+')
alphabet_text_list = p.findall(text)

# Merge the extracted runs (list) into one space-separated string
alphabet_text = ' '.join(alphabet_text_list)
print(alphabet_text)

the main henry ford museum building houses some of the classrooms for the henry ford academy henry ford academy is the first charter school in the united states to be developed jointly by a global corporation public education and a major nonprofit cultural institution the school is sponsored by the ford motor company wayne county regional educational service agency and the henry ford museum and admits high school students it is located in dearborn michigan on the campus of the henry ford museum enrollment is taken from a lottery in the area and totaled in freshman meet inside the main museum building in glass walled classrooms while older students use a converted carousel building and pullman cars on a siding of the greenfield village railroad classes are expected to include use of the museum artifacts a tradition of the original village schools when the museum was established in it included a school which served grades kindergarten to college trade school ages the last part of the ori

In [23]:
# Count how many times each letter occurs in the extracted text
alphabet_list = list('abcdefghijklmnopqrstuvwxyz')

for letter in alphabet_list:
    print(f'{letter}의 갯수 : {alphabet_text.count(letter)}개')


a의 갯수 : 349개
b의 갯수 : 59개
c의 갯수 : 210개
d의 갯수 : 212개
e의 갯수 : 484개
f의 갯수 : 72개
g의 갯수 : 88개
h의 갯수 : 201개
i의 갯수 : 340개
j의 갯수 : 8개
k의 갯수 : 25개
l의 갯수 : 247개
m의 갯수 : 121개
n의 갯수 : 356개
o의 갯수 : 412개
p의 갯수 : 76개
q의 갯수 : 0개
r의 갯수 : 357개
s의 갯수 : 282개
t의 갯수 : 370개
u의 갯수 : 119개
v의 갯수 : 45개
w의 갯수 : 65개
x의 갯수 : 3개
y의 갯수 : 92개
z의 갯수 : 2개


## 알파벳 카운트 함수 생성
---

In [24]:
# 알파벳 개별 개수 카운팅 함수
def count_alphabet(file_name, path):
    """Count how often each letter a-z occurs in a text file.

    The file is read as UTF-8 and lower-cased first, so upper- and lower-case
    ASCII letters are counted together. Every other character (digits,
    punctuation, whitespace, accented letters) is ignored.

    Args:
        file_name: Name of the file inside ``path``.
        path: Directory that contains the file, with or without a trailing
            path separator.

    Returns:
        A list of 26 ints: the counts for 'a' through 'z', in that order.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # os.path.join also works when ``path`` has no trailing separator,
    # where plain string concatenation would build a wrong file name.
    with open(os.path.join(path, file_name), mode='r', encoding='utf-8') as f:
        text = f.read().lower()

    # str.count only matches the exact letter, so counting on the lower-cased
    # text directly gives the same numbers as filtering to [a-z] first.
    return [text.count(letter) for letter in 'abcdefghijklmnopqrstuvwxyz']


In [35]:
# Build the feature table directly: one row per training file, one column per
# letter, each cell holding that letter's count in that file.
letter_counts = [count_alphabet(file_name, path_train) for file_name in dir_list]
df = pd.DataFrame(letter_counts, index=dir_list, columns=alphabet_list)
df

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,q,r,s,t,u,v,w,x,y,z
en-1.txt,349,59,210,212,484,72,88,201,340,8,...,0,357,282,370,119,45,65,3,92,2
en-2.txt,706,167,255,326,1149,146,262,230,632,22,...,46,756,601,652,257,115,117,17,90,5
en-3.txt,259,44,165,118,434,53,91,85,342,9,...,6,195,318,293,105,68,43,2,65,2
en-4.txt,383,147,159,210,642,89,125,313,346,8,...,2,314,389,497,129,27,104,32,93,9
en-5.txt,337,93,142,181,645,93,93,260,297,15,...,2,331,272,436,114,49,109,14,68,3
fr-10.txt,1228,163,581,831,2347,253,167,242,1082,36,...,92,1171,1117,1210,834,223,11,61,57,13
fr-6.txt,2162,414,992,1244,4155,327,319,350,1998,59,...,190,2074,2278,1958,1511,295,126,108,148,13
fr-7.txt,1230,221,599,761,2550,178,179,195,1208,51,...,112,1301,1325,1111,861,215,44,84,82,2
fr-8.txt,585,127,321,412,1266,105,109,116,690,35,...,47,661,536,492,344,122,6,41,40,16
fr-9.txt,405,59,181,266,735,62,103,100,461,20,...,24,399,371,325,225,72,2,17,27,6


In [26]:
# Build the target labels.
# Every training file is named '<language>-<number>.txt', so the label is the
# part of the name before the first '-'. Deriving it from dir_list keeps the
# labels aligned with the file order no matter how many files each language
# has or in which order they were listed.
target = [file_name.split('-')[0] for file_name in dir_list]
target

['en',
 'en',
 'en',
 'en',
 'en',
 'fr',
 'fr',
 'fr',
 'fr',
 'fr',
 'id',
 'id',
 'id',
 'id',
 'id',
 'tl',
 'tl',
 'tl',
 'tl',
 'tl']

# 분석
---

##### [1] 학습

In [27]:
# Get every scikit-learn estimator matching the type filter as a list of
# (name, class) pairs.
models = all_estimators(type_filter='classifier')

# Fit each classifier with its default settings and record its accuracy.
# NOTE: the score is computed on the same data the model was fitted on, so it
# shows how well the model fits the training set, not how well it generalises.
scores = []
failed = []  # (name, exception) for every classifier that could not be evaluated
for name, model in models:
    try:
        # Create the model object
        md = model()
        # Train
        md.fit(df, target)
        # Evaluate
        result = md.score(df, target)
    except Exception as exc:
        # Skip classifiers that cannot be built without arguments or that
        # reject this data, but keep the reason for later inspection.
        # Catching Exception (not a bare except) lets KeyboardInterrupt
        # still stop the loop.
        failed.append((name, exc))
    else:
        scores.append((name, result))

  classes = classes[classes != -1]
  unlabeled = y == -1
  classes = classes[classes != -1]
  unlabeled = y == -1
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://sc

In [28]:
# 각 분류기 모듈명과 score
scores

[('AdaBoostClassifier', 0.65),
 ('BaggingClassifier', 1.0),
 ('BernoulliNB', 0.45),
 ('CalibratedClassifierCV', 1.0),
 ('CategoricalNB', 1.0),
 ('ComplementNB', 0.65),
 ('DecisionTreeClassifier', 1.0),
 ('DummyClassifier', 0.25),
 ('ExtraTreeClassifier', 1.0),
 ('ExtraTreesClassifier', 1.0),
 ('GaussianNB', 0.85),
 ('GaussianProcessClassifier', 1.0),
 ('GradientBoostingClassifier', 1.0),
 ('HistGradientBoostingClassifier', 0.25),
 ('KNeighborsClassifier', 0.65),
 ('LinearDiscriminantAnalysis', 1.0),
 ('LinearSVC', 1.0),
 ('LogisticRegression', 1.0),
 ('LogisticRegressionCV', 1.0),
 ('MLPClassifier', 1.0),
 ('MultinomialNB', 0.95),
 ('NearestCentroid', 0.7),
 ('NuSVC', 0.7),
 ('PassiveAggressiveClassifier', 1.0),
 ('Perceptron', 1.0),
 ('QuadraticDiscriminantAnalysis', 1.0),
 ('RadiusNeighborsClassifier', 1.0),
 ('RandomForestClassifier', 1.0),
 ('RidgeClassifier', 1.0),
 ('RidgeClassifierCV', 1.0),
 ('SGDClassifier', 1.0),
 ('SVC', 0.7)]

In [29]:
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression

In [30]:
# Fit the two classifiers picked for a closer look, SGDClassifier and
# LogisticRegression, with a fixed seed. fit() returns the estimator itself,
# so construction and fitting can be chained.
model_sgd = SGDClassifier(random_state=42).fit(df, target)

model_LR = LogisticRegression(max_iter=500, random_state=42).fit(df, target)
model_LR


In [31]:
# List the test files. os.listdir() returns entries in arbitrary order, so
# sort them to print the predictions in the same order on every run.
test_list = sorted(os.listdir(path_test))
test_list

['en-1.txt',
 'en-2.txt',
 'fr-3.txt',
 'fr-4.txt',
 'id-5.txt',
 'id-6.txt',
 'tl-7.txt',
 'tl-8.txt']

# 테스트 데이터로 검증

In [32]:
# Read each test file once and turn its letter counts into a single-row 2-D
# array (the shape predict() expects); both models reuse the same features.
# reshape(1, -1) takes the row width from the data instead of hard-coding it.
test_features = [
    np.array(count_alphabet(file, path_test)).reshape(1, -1)
    for file in test_list
]

labelled_models = (
    ('[ SGDClassifier 예측 ]', model_sgd),
    ('[ LogisticRegression 예측 ]', model_LR),
)

for position, (title, classifier) in enumerate(labelled_models):
    # Blank line between the two reports, but not before the first one
    if position:
        print()
    print(title)
    for arr_data in test_features:
        print(classifier.predict(arr_data))

[ SGDClassifier 예측 ]
['en']
['en']
['fr']
['fr']
['id']
['id']
['tl']
['tl']

[ LogisticRegression 예측 ]
['en']
['en']
['en']
['fr']
['id']
['id']
['tl']
['tl']


## 교차 검증
---

In [33]:
# Cross-validate both models (the captured output below shows five folds),
# keeping the training scores as well as the validation scores.
result_LR, result_sgd = (
    cross_validate(estimator, df, target, return_train_score=True)
    for estimator in (model_LR, model_sgd)
)

for cv_result in (result_LR, result_sgd):
    print(cv_result)

{'fit_time': array([0.01299572, 0.02717876, 0.01299214, 0.02729321, 0.02474046]), 'score_time': array([0.00099945, 0.00105166, 0.0020411 , 0.00100064, 0.00100088]), 'test_score': array([1.  , 1.  , 1.  , 1.  , 0.75]), 'train_score': array([1., 1., 1., 1., 1.])}
{'fit_time': array([0.00300264, 0.00307846, 0.00307965, 0.00200915, 0.00200129]), 'score_time': array([0.00099468, 0.00092077, 0.00092077, 0.000911  , 0.00107622]), 'test_score': array([1., 1., 1., 1., 1.]), 'train_score': array([1.    , 1.    , 1.    , 0.9375, 1.    ])}


In [34]:
# Report the mean train and validation score of each cross-validated model.
reports = (
    ('[ LogisticRegression 점수 ]', 'LR_train score:', 'LR_test score:', result_LR),
    ('[ SGDClassifier 점수 ]', 'sgd_train score:', 'sgd_test score:', result_sgd),
)

for position, (header, train_label, test_label, cv_result) in enumerate(reports):
    # Blank line between the two reports, but not before the first one
    if position:
        print()
    print(header)
    print(train_label, cv_result['train_score'].mean())
    print(test_label, cv_result['test_score'].mean())

[ LogisticRegression 점수 ]
LR_train score: 1.0
LR_test score: 0.95

[ SGDClassifier 점수 ]
sgd_train score: 0.9875
sgd_test score: 1.0
