<a href="https://colab.research.google.com/github/junkyuhufs/Practice/blob/main/GNU_Session2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# AI, Digital Literacy, and Convergence Education: Connecting Humanities and Technology

## 자연어처리와 디지털인문학 따라하기

### Junkyu Lee (Hankuk University of Foreign Studies)

In [None]:
#@markdown Introduction Slides (1~8)
from IPython.display import display
import ipywidgets as widgets
import requests


def make_slide_viewer(slide_urls, width="700", height="600"):
    """Build a slide viewer: one image plus one numbered button per slide.

    Args:
        slide_urls: Sequence of image URLs; button ``i`` shows ``slide_urls[i - 1]``.
        width: Width passed to ``widgets.Image``.
        height: Height passed to ``widgets.Image``.

    Returns:
        A ``widgets.HBox`` containing the image and a vertical column of buttons.
    """
    def fetch(url):
        # Bounded wait so an unreachable host cannot block the kernel forever.
        return requests.get(url, timeout=30).content

    # The first slide is downloaded eagerly so the viewer is never empty.
    image = widgets.Image(value=fetch(slide_urls[0]), width=width, height=height)

    def on_button_click(button):
        # `image` and `slide_urls` are captured from this call rather than read
        # from notebook globals, so running another slide cell later cannot
        # redirect this viewer's buttons to a different image or URL list.
        sn = int(button.description) - 1
        image.value = fetch(slide_urls[sn])

    button_layout = widgets.Layout(width='50px', height='30px')
    # One button per URL, so the button count always matches the slide list.
    buttons = [widgets.Button(description=str(i), layout=button_layout)
               for i in range(1, len(slide_urls) + 1)]
    for button in buttons:
        button.on_click(on_button_click)

    return widgets.HBox([image, widgets.VBox(buttons)])


urls = ["https://raw.githubusercontent.com/junkyuhufs/Practice/main/slide.01.JPG",
        "https://raw.githubusercontent.com/junkyuhufs/Practice/main/slide.03.JPG",
        "https://raw.githubusercontent.com/junkyuhufs/Practice/main/slide.02.JPG",
        "https://raw.githubusercontent.com/junkyuhufs/Practice/main/slide.05.JPG",
        "https://raw.githubusercontent.com/junkyuhufs/Practice/main/slide.04.JPG",
        "https://raw.githubusercontent.com/junkyuhufs/Practice/main/slide.06.JPG",
        "https://raw.githubusercontent.com/junkyuhufs/Practice/main/slide.07.JPG",
        "https://raw.githubusercontent.com/junkyuhufs/Practice/main/slide.08.JPG"
]

display(make_slide_viewer(urls))

# Let's start with Python
## Preprocessing examples

In [None]:
#@markdown Preprocess & Python library Slides (9~11)
from IPython.display import display
import ipywidgets as widgets
import requests


def make_slide_viewer(slide_urls, width="700", height="600"):
    """Build a slide viewer: one image plus one numbered button per slide.

    Args:
        slide_urls: Sequence of image URLs; button ``i`` shows ``slide_urls[i - 1]``.
        width: Width passed to ``widgets.Image``.
        height: Height passed to ``widgets.Image``.

    Returns:
        A ``widgets.HBox`` containing the image and a vertical column of buttons.
    """
    def fetch(url):
        # Bounded wait so an unreachable host cannot block the kernel forever.
        return requests.get(url, timeout=30).content

    # The first slide is downloaded eagerly so the viewer is never empty.
    image = widgets.Image(value=fetch(slide_urls[0]), width=width, height=height)

    def on_button_click(button):
        # `image` and `slide_urls` are captured from this call rather than read
        # from notebook globals, so running another slide cell later cannot
        # redirect this viewer's buttons to a different image or URL list.
        sn = int(button.description) - 1
        image.value = fetch(slide_urls[sn])

    button_layout = widgets.Layout(width='50px', height='30px')
    # One button per URL, so the button count always matches the slide list.
    buttons = [widgets.Button(description=str(i), layout=button_layout)
               for i in range(1, len(slide_urls) + 1)]
    for button in buttons:
        button.on_click(on_button_click)

    return widgets.HBox([image, widgets.VBox(buttons)])


urls = ["https://raw.githubusercontent.com/junkyuhufs/Practice/main/slide.17.JPG",
        "https://raw.githubusercontent.com/junkyuhufs/Practice/main/slide.18.JPG",
        "https://raw.githubusercontent.com/junkyuhufs/Practice/main/slide.11.JPG"
]

display(make_slide_viewer(urls))

In [8]:
#@markdown Import/Install relevant packages
import spacy

# Name of the spaCy pipeline package used for every preprocessing example below.
MODEL_NAME = "en_core_web_sm"

# `nlp` is the callable pipeline that later cells apply to raw text.
nlp = spacy.load(MODEL_NAME)

In [None]:
#@markdown "The rain in Spain falls mainly on the plain." 전처리
import pandas as pd

text = "The rain in Spain falls mainly on the plain."
doc = nlp(text)

# Plain-text dump of each token: surface form, lemma, POS tag, stop-word flag.
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.is_stop)

# The same attributes as a table, plus spaCy's description of each POS tag.
cols = ("text", "lemma", "POS", "explain", "stopword")
rows = [
    [tok.text, tok.lemma_, tok.pos_, spacy.explain(tok.pos_), tok.is_stop]
    for tok in doc
]
df = pd.DataFrame(rows, columns=cols)

df

In [None]:
#@markdown Sentence 시각화
from spacy import displacy

# Draw the dependency parse of `doc` inline in the notebook.
render_options = {"style": "dep", "jupyter": True}
displacy.render(doc, **render_options)

# Topic modeling w/ 미국 대통령 연설문

In [None]:
#@markdown LDA Slides (9~11)
from IPython.display import display
import ipywidgets as widgets
import requests


def make_slide_viewer(slide_urls, width="700", height="600"):
    """Build a slide viewer: one image plus one numbered button per slide.

    Args:
        slide_urls: Sequence of image URLs; button ``i`` shows ``slide_urls[i - 1]``.
        width: Width passed to ``widgets.Image``.
        height: Height passed to ``widgets.Image``.

    Returns:
        A ``widgets.HBox`` containing the image and a vertical column of buttons.
    """
    def fetch(url):
        # Bounded wait so an unreachable host cannot block the kernel forever.
        return requests.get(url, timeout=30).content

    # The first slide is downloaded eagerly so the viewer is never empty.
    image = widgets.Image(value=fetch(slide_urls[0]), width=width, height=height)

    def on_button_click(button):
        # `image` and `slide_urls` are captured from this call rather than read
        # from notebook globals, so re-running another slide cell cannot
        # redirect this viewer's buttons to a different image or URL list.
        sn = int(button.description) - 1
        image.value = fetch(slide_urls[sn])

    button_layout = widgets.Layout(width='50px', height='30px')
    # One button per URL, so the button count always matches the slide list.
    buttons = [widgets.Button(description=str(i), layout=button_layout)
               for i in range(1, len(slide_urls) + 1)]
    for button in buttons:
        button.on_click(on_button_click)

    return widgets.HBox([image, widgets.VBox(buttons)])


urls = ["https://raw.githubusercontent.com/junkyuhufs/Practice/main/slide.09.JPG",
        "https://raw.githubusercontent.com/junkyuhufs/Practice/main/slide.10.JPG"
]

display(make_slide_viewer(urls))

### Downloading necessary files

* **state-of-the-union.csv:** State of the Union addresses - each presidential address from 1970 to 2012

[Topic modeling 실습파일](https://nyc3.digitaloceanspaces.com/ml-files-distro/v1/text-analysis/data/state-of-the-union.csv)

In [None]:
#@markdown Download the csv file
# Make data directory if it doesn't exist
!mkdir -p data
# Fetch the State of the Union CSV into data/; -nc (no-clobber) leaves an
# already-downloaded copy untouched, so the cell can be re-run safely.
!wget -nc https://nyc3.digitaloceanspaces.com/ml-files-distro/v1/text-analysis/data/state-of-the-union.csv -P data

In [None]:
#@markdown Convert csv to Pandas & data cleaning
import pandas as pd

# Load the speeches downloaded into data/ by the previous cell.
df = pd.read_csv("data/state-of-the-union.csv")

# Replace every character that is not an ASCII letter or a space (digits,
# punctuation, underscores, line breaks, ...) with a space.
# `regex=True` must be explicit: from pandas 2.0 on, `Series.str.replace`
# treats the pattern as a literal string by default, which would leave the
# text completely uncleaned.
df.content = df.content.str.replace("[^A-Za-z ]", " ", regex=True)
df.head()

In [None]:
#@markdown Explore data w/ wordcloud
from wordcloud import WordCloud

# Concatenate every cleaned speech into one comma-separated string.
long_string = ','.join(df.content.tolist())

# Configure the cloud: white canvas, up to 5000 words, steel-blue contour.
wordcloud = WordCloud(
    background_color="white",
    max_words=5000,
    contour_width=3,
    contour_color='steelblue',
)

# Build the cloud from the joined text.
wordcloud.generate(long_string)

# The resulting image is the cell's output.
wordcloud.to_image()

## Prepare data for LDA (Latent Dirichlet Allocation; 잠재 디리클레할당)
### `gensim.utils.simple_preprocess` converts a document into a list of tokens: it lowercases, tokenizes, and (optionally) de-accents the text.

In [None]:
#@markdown gensim의 simple_preprocess 이용한 토큰화
!pip install gensim
import gensim
from gensim.utils import simple_preprocess
df.content = df.content.apply(simple_preprocess)

In [None]:
#@markdown stopwords 제거
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords

# English stop-word list shipped with NLTK.
stop_words = stopwords.words('english')
# stop_words.extend(['from', 'to']) # add more if want

# Membership is tested once per token of every speech; a set makes each test
# constant-time instead of a scan over the whole stop-word list.
stop_word_set = set(stop_words)

# Keep only the tokens that are not stop words.
df.content = df.content.apply(
    lambda words: [word for word in words if word not in stop_word_set]
)

In [None]:
#@markdown 토큰화 > 카운트벡터 (BOW형태로 변환) 형태인 corpus 생성; 첫번째 결과 확인
from gensim import corpora

# Tokenized documents: the `content` column as produced by the previous cells.
texts = df["content"]

# Vocabulary mapping each distinct token to an integer id.
dictionary = corpora.Dictionary(texts)

# Prune the vocabulary by document frequency: drop tokens that occur in fewer
# than 5 documents or in more than 50% of all documents.
dictionary.filter_extremes(no_below=5, no_above=0.5)

# doc2bow() turns each token list into a bag-of-words count vector; the
# resulting list is the corpus that the gensim models below consume.
corpus = list(map(dictionary.doc2bow, texts))

# Inspect the first document's vector.
corpus[0]

In [None]:
#@markdown 카운트 벡터형태의 corpus를 TF-IDF로 변환; 첫 번째 결과 확인
from gensim import models

# Fit a TF-IDF model on the bag-of-words corpus, then apply it to that corpus.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# Inspect the TF-IDF weights of the first document.
first_doc_tfidf = corpus_tfidf[0]
first_doc_tfidf

In [None]:
# Run LDA (Latent Dirichlet Allocation) with the number of topics set to 15.
from gensim import models

n_topics = 15

# LDA training is stochastic; fixing the seed makes Restart & Run All produce
# the same topics every time instead of a different model on each run.
RANDOM_SEED = 42

lda_model = models.ldamodel.LdaModel(
    corpus=corpus_tfidf,
    id2word=dictionary,
    num_topics=n_topics,
    random_state=RANDOM_SEED,
)

In [None]:
# Display the topics learned by the LDA model as the cell's output.
lda_model.print_topics()

In [None]:
#@markdown LDA결과 시각화
!pip install pyLDAvis
!pip install "pandas<2.0.0" 

import pyLDAvis
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
vis