### 패키지 설치 및 임포트

In [1]:
# pip install pandas
# pip install matplotlib
# pip install gensim
# pip install scikit-learn

In [2]:
import urllib.request
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
import re
from PIL import Image
from io import BytesIO
# from gensim.models import Word2Vec
# from gensim.models import KeyedVectors
# from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity

### 데이터 불러와서 df 변환

In [58]:
def read_json_to_df(file_path):
    try:
        # JSON 파일을 DataFrame으로 읽어오기
        df = pd.read_json(file_path)
        return df
    except FileNotFoundError:
        print("File not found.")
        return None

# JSON 파일 경로 설정
json_file_path_1 = 'category/article(0308).json'
json_file_path_2 = 'category/article(0319).json'

# JSON 파일을 DataFrame으로 변환하여 출력
df1 = read_json_to_df(json_file_path_1)
df1.drop(df1[(df1['category'] == 'us') | (df1['category'] == 'world')].index, inplace=True)

df2 = read_json_to_df(json_file_path_2)

df = pd.concat([df1, df2])
if df is not None:
    print(df)

                                                 title  category  \
97                   3 very different border realities  politics   
98   Biden directs US military to establish aid por...  politics   
99   Jill Biden’s State of the Union guests include...  politics   
100  Supreme Court is under pressure to step into t...  politics   
101  Biden projects a vision of strength that’s bee...  politics   
..                                                 ...       ...   
316  Despite being fired earlier this week, Long Be...    sports   
317  Dallas Cowboys quarterback Dak Prescott being ...    sports   
318  Mo Salah sets Liverpool goalscoring record in ...    sports   
319  Lionel Messi and Luis Suárez roll back the yea...    sports   
320  ‘When I see Russian athletes … I see every cit...    sports   

                                                  text  
97   Three completely different border realities we...  
98   As President Joe Biden addressed Congress and ...  
99   An Alab

### 기사 텍스트 데이터 클리닝

In [59]:
from nltk.tokenize import RegexpTokenizer

def make_lower_case(text):
    return text.lower()

# 구두문자 제거
def remove_punctuation(text):
    tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
    text = tokenizer.tokenize(text)
    text = " ".join(text)
    return text

df['cleaned_text'] = df.text.apply(make_lower_case)
df['cleaned_text'] = df.cleaned_text.apply(remove_punctuation)

df['cleaned_text']

97     three completely different border realities we...
98     as president joe biden addressed congress and ...
99     an alabama woman seeking in vitro fertilizatio...
100    the supreme court is facing intense pressure t...
101    this version of joe biden could beat donald tr...
                             ...                        
316    the long beach state beach men s basketball te...
317    days after being sued for allegedly trying to ...
318    as bad days at the office go it s unlikely spa...
319    lionel messi and luis su rez rolled back the y...
320    at some point last year yaroslava mahuchikh vo...
Name: cleaned_text, Length: 645, dtype: object

## 모델 훈련

In [5]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

### 데이터셋 로드

In [60]:
X = df['cleaned_text'].tolist()
y = df['category'].tolist()

### 카테고리, 텍스트 전처리

In [61]:
# 카테고리 정수 변환
categories = df['category'].unique()

# 각 카테고리에 고유한 정수 인덱스를 지정하기 위한 매핑 딕셔너리 생성
category_mapping = {category: index for index, category in enumerate(categories)}

# 각 카테고리를 정수 인덱스로 변환
y_encoded = np.array([category_mapping[category] for category in y])

In [62]:
# 텍스트 데이터 전처리
# 딥러닝 위해 패딩 처리
max_length = 100
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_sequences, maxlen=max_length, padding='post', truncating='post')

# 데이터셋 분할
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_encoded, test_size=0.3, random_state=42)

### 하이퍼파라미터 결정

In [63]:
def get_longest_word_count(sentences):
    # 각 문장의 길이를 측정하여 리스트에 저장합니다.
    sentence_lengths = [len(sentence.split()) for sentence in sentences]

    # 최대 길이를 갖는 문장의 인덱스를 찾습니다.
    max_length_index = sentence_lengths.index(max(sentence_lengths))

    # 최대 길이를 갖는 문장을 가져옵니다.
    longest_sentence = sentences[max_length_index]

    # 가장 긴 문장의 단어 수를 카운트합니다.
    longest_sentence_word_count = len(set(longest_sentence.split()))

    print("가장 긴 문장의 단어 수:", longest_sentence_word_count)

get_longest_word_count(X)

가장 긴 문장의 단어 수: 1543


In [70]:
# 모델 구성
vocab_size = 3000
embedding_dim = 300

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(7, activation='softmax')
])

# 모델 컴파일
# model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# 모델 훈련
model.fit(X_train, y_train, epochs=45, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/45
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step - accuracy: 0.1867 - loss: 1.9390 - val_accuracy: 0.1856 - val_loss: 1.9092
Epoch 2/45
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.2365 - loss: 1.8908 - val_accuracy: 0.1959 - val_loss: 1.8576
Epoch 3/45
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.2440 - loss: 1.8412 - val_accuracy: 0.2216 - val_loss: 1.8047
Epoch 4/45
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.2587 - loss: 1.7689 - val_accuracy: 0.3041 - val_loss: 1.7266
Epoch 5/45
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.4366 - loss: 1.6655 - val_accuracy: 0.4948 - val_loss: 1.6455
Epoch 6/45
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6633 - loss: 1.4773 - val_accuracy: 0.5155 - val_loss: 1.5131
Epoch 7/45
[1m15/15[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x1bc0222cda0>

In [71]:
# 모델 평가
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7724 - loss: 0.6529 
Test Accuracy: 0.7680412530899048


## 테스트

### 데이터 가져오기

In [50]:
import pandas as pd

json_file_path = 'CNNect.data(full).json'

df = read_json_to_df(json_file_path)
df.drop(columns=df.columns.difference(['full_script', 'video_id', 'title']), inplace=True)

if df is not None:
    print(df)

          video_id                                        full_script
0      jsxI0QjxJs8  Here with me now is Israeli Prime Minister Ben...
1      rGMqBoF3hkU  After severe storms and tornadoes hit multiple...
2      2keAv3W3edw  As you know, it's been a tough week for the pr...
3      dyXBfjwu4fA  Are looking at images seen earlier in Trenton,...
4      7ozPfsLRxZw  'M  JOHN BETTER, BUT POUNDS HAD  THIS IS CNN  ...
...            ...                                                ...
35387  HGQct9tFpm0  CALLS FOR BLOOD ARE GETTING  LOUDER AND LOUDER...
35388  nLtnU20ZVO0  OF YOU SOME INFORMATION ABOUT  WHAT WE'RE DOIN...
35389  vmxgRP7OZf8  MOSTLY A CLANDESTINE PRESENCE  OPERATED BY THE...
35390  34-cY8wIrcw  AND HELPING THEIR DREAMS TO COME TRUE. AND WE ...
35391  -aWdR1z1HfI  bjbjLULU Joining us by telephone tonight is\np...

[35392 rows x 2 columns]


### 전처리 : 토큰화 및 패딩

In [51]:
get_longest_word_count(df['full_script'].tolist())

가장 긴 문장의 단어 수: 709


In [19]:
print(category_mapping)

{'politics': 0, 'business': 1, 'health': 2, 'entertainment': 3, 'style': 4, 'travel': 5, 'sports': 6}


In [52]:
df['cleaned_script'] = df.full_script.apply(make_lower_case)
df['cleaned_script'] = df.cleaned_script.apply(remove_punctuation)

df['cleaned_script']

0        here with me now is israeli prime minister ben...
1        after severe storms and tornadoes hit multiple...
2        as you know it s been a tough week for the pro...
3        are looking at images seen earlier in trenton ...
4        m john better but pounds had this is cnn what ...
                               ...                        
35387    calls for blood are getting louder and louder ...
35388    of you some information about what we re doing...
35389    mostly a clandestine presence operated by the ...
35390    and helping their dreams to come true and we r...
35391    bjbjlulu joining us by telephone tonight is pi...
Name: cleaned_script, Length: 35392, dtype: object

In [35]:
from tabulate import tabulate

In [53]:
max_length = 1000
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['cleaned_script'])
X_sequences = tokenizer.texts_to_sequences(df['cleaned_script'])
X_padded = pad_sequences(X_sequences, maxlen=max_length, padding='post', truncating='post')

# 예측 수행
predictions = model.predict(X_padded)

# 예측 결과를 DataFrame에 새로운 열로 추가
df['predictions'] = [np.argmax(pred) for pred in predictions]


[1m1106/1106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step


In [56]:
# 숫자에서 문자로 카테고리 변환을 위한 딕셔너리 생성
category_reverse_mapping = {index: category for category, index in category_mapping.items()}

# 예측 결과를 문자로 변환하여 새로운 열에 추가
df['predicted_category'] = df['predictions'].map(category_reverse_mapping)

In [41]:
print(tabulate(df[['video_id', 'predicted_category']][:100], headers='keys', tablefmt='pretty'))

+----+-------------+--------------------+
|    |  video_id   | predicted_category |
+----+-------------+--------------------+
| 0  | jsxI0QjxJs8 |      business      |
| 1  | rGMqBoF3hkU |       travel       |
| 2  | 2keAv3W3edw |      politics      |
| 3  | dyXBfjwu4fA |      politics      |
| 4  | 7ozPfsLRxZw |   entertainment    |
| 5  | Q1KbKe2eJaY |      business      |
| 6  | dwjJS2p0w4A |      business      |
| 7  | ibBNhhhTgMk |      politics      |
| 8  | dvccoSqV8NE |      business      |
| 9  | 3eh5nCYG7k8 |      politics      |
| 10 | xqTaUR0UsqI |      business      |
| 11 | NsqlJ9HnuLc |      politics      |
| 12 | xifyyLL500w |      business      |
| 13 | 1TijdSYu6Kc |      business      |
| 14 | aEDLxivVPTY |      politics      |
| 15 | XrICFSK3rA8 |      politics      |
| 16 | 9BQT7odQwgw |      business      |
| 17 | dRFXqGSMRPY |      business      |
| 18 | Bl0Imc6nDrk |      politics      |
| 19 | oGxqtJY48k0 |      business      |
| 20 | FtE414zkSX0 |   entertainme

In [57]:
prediction_counts = df['predicted_category'].value_counts()
print(prediction_counts)


predicted_category
business         22933
health            7987
politics          2957
style              592
travel             589
sports             308
entertainment       26
Name: count, dtype: int64
