### rate: 미국 국채
### df: FOMC, Interview, Speech merge data
### news_df: NEWS
### is_df: interview + speech
### fomc_df: FOMC 회의록
### data_df: 금리가 있는 날짜에 맞춰, df 정리한 데이터 -> 최종 데이터

##### 뉴스 데이터 합치기

In [41]:
import pandas as pd
import os

# 엑셀 파일 병합하기
filepath = '/content/drive/MyDrive/Colab Notebooks/Unclock-watchers/news/'

# 파일 경로
file_list = os.listdir(filepath)
# 경로에 있는 모든 csv 파일 리스트 불러오기
file_list_csv = [file for file in file_list if file.endswith('.csv')]

# 저장할 dataframe
merged_df = pd.DataFrame()

for file in file_list_csv:
    # 파일의 경로
    file_path = os.path.join(filepath, file)

    try:
        # read_csv 기능으로 파일 읽음
        df = pd.read_csv(file_path, dtype='object')

        # 파일이 비어있는지 확인
        if not df.empty:
            # 병합된 데이터프레임에 추가
            merged_df = merged_df.append(df)
        else:
            print(f"Warning: File {file} is empty. Skipping...")
            # 비어있는 파일 삭제
            os.remove(file_path)
    except pd.errors.EmptyDataError:
        print(f"Error: File {file} is empty. Skipping...")
        # 비어있는 파일 삭제
        os.remove(file_path)

# 병합 엑셀 파일 저장
output_filepath = "/content/drive/MyDrive/Colab Notebooks/Unclock-watchers/news/merged_news_2019~2023.csv"
merged_df.to_csv(output_filepath, index=False, encoding='utf-8-sig', mode='w')

# 병합된 데이터프레임 출력
print(merged_df.head())

# 병합된 데이터프레임의 크기 출력
print("Merged DataFrame Shape:", merged_df.shape)

FileNotFoundError: [WinError 3] 지정된 경로를 찾을 수 없습니다: '/content/drive/MyDrive/Colab Notebooks/Unclock-watchers/news/'

In [None]:
import pandas as pd

merged_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Unclock-watchers/news/merged_news_2019~2023.csv")

In [None]:
# 'title' 컬럼 삭제
merged_df = merged_df.drop(columns=['Unnamed: 0','url','title', 'media'])

In [None]:
merged_df.rename(columns={'content':'Text'}, inplace = True)

In [None]:
# 'date' 열을 datetime 형식으로 변환
merged_df['date'] = pd.to_datetime(merged_df['date'])

# 'date' 열을 원하는 형식으로 변환
merged_df['date'] = merged_df['date'].dt.strftime("%Y-%m-%d")

merged_df

Unnamed: 0,date,Text
0,2019-08-26,Leading stock exchange BSE on Monday launched ...
1,2019-08-26,Leading stock exchange BSE on Monday launched ...
2,2019-11-22,New Delhi: Leading stock exchange NSE on Frida...
3,2019-03-29,The government has left unchanged the interest...
4,2019-06-28,Interest rates on small savings schemes such a...
...,...,...
395826,2023-09-21,Our portfolio is strategically focused on sect...
395827,2023-03-01,"In this episode of ""Intelligence Matters,"" hos..."
395828,2023-03-01,"In this episode of ""Intelligence Matters,"" hos..."
395829,2023-06-01,With help from Eli Okun and Garrett Ross Kevin...


##### FOMC, Interview, Speech

In [None]:
import pandas as pd

# 주어진 텍스트 파일 경로
file_path = '/content/drive/MyDrive/Colab Notebooks/Unclock-watchers/merge/merged_2018~2023.txt'

# 각 항목을 담을 리스트
data_list = []

# 텍스트 파일을 읽어서 각 항목의 시작 지점을 찾아 리스트에 추가
with open(file_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()
    for line in lines:
        # 예시로 각 행이 개별 문장 또는 단락으로 구분되어 있다고 가정하고 '\n'로 나누기
        parts = line.strip().split('\n')

        # 리스트에 추가
        data_list.extend(parts)

# 데이터프레임으로 변환
df = pd.DataFrame(data_list, columns=['Text'])
df

Unnamed: 0,Text
0,"﻿""STEVE LIESMAN. Mr. Chairman right, the micro..."
1,"""JIM TANKERSLEY. Hi, Mr. Chairman. Jim Tankers..."
2,"""NICK TIMIRAOS. Thank you. Nick Timiraos, the ..."
3,"""SAM FLEMING. Thanks very much. Sam Fleming fr..."
4,"""HOWARD SCHNEIDER. Howard Schneider with Reute..."
...,...
128,"""Minutes of the Federal Open Market Committee ..."
129,"""Minutes of the Federal Open Market Committee ..."
130,"""Minutes of the Federal Open Market Committee ..."
131,"""Minutes of the Federal Open Market Committee ..."


##### news, interview+speech, fomc concat

In [None]:
merged_df = merged_df.sort_values(by='date')
merged_df.to_csv('/content/drive/MyDrive/Colab Notebooks/Unclock-watchers/news/sort_merged_news_2019~2023.csv')

In [None]:
# news
news_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Unclock-watchers/news/sort_merged_news_2019~2023.csv")

In [None]:
import pandas as pd

# 'Text' 열의 부동 소수점 값을 처리하기 위해 'Text' 열을 문자열로 변환합니다
news_df['Text'] = news_df['Text'].astype(str)

# 'date'로 그룹화하고 텍스트를 결합
news_df = news_df.groupby('date')['Text'].apply(lambda x: ' '.join(x)).reset_index()

news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1784 entries, 0 to 1783
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    1784 non-null   object
 1   Text    1784 non-null   object
dtypes: object(2)
memory usage: 28.0+ KB


In [None]:
news_df['Text'][0]

In [None]:
# interview + speech
is_df = pd.read_excel("/content/drive/MyDrive/Colab Notebooks/Unclock-watchers/news/연설+인터뷰.xlsx")
is_df.rename(columns={'Unnamed: 0':'date', 'Unnamed: 1':'Text'}, inplace = True)
is_df.sort_values(by = 'date', inplace = True)
is_df = is_df.groupby('date')['Text'].apply(lambda x: ' '.join(x)).reset_index()

# 2018년 데이터 제외
is_df = is_df[is_df['date'].str.startswith('2018') == False]
is_df.sort_values(by='date', inplace=True)

is_df.info()

In [None]:
# fomc
fomc_df = pd.read_excel("/content/drive/MyDrive/Colab Notebooks/Unclock-watchers/news/fomc회의록 merged.xlsx")
# 'Text' 열에서 '\n'을 공백으로 대체
fomc_df['Text'] = fomc_df['Text'].str.replace('\n', ' ')
fomc_df['date'] = fomc_df['date'].astype(str)

# 2018년 데이터 제외
fomc_df = fomc_df[fomc_df['date'].str.startswith('2018') == False]
fomc_df.sort_values(by='date', inplace=True)

fomc_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38 entries, 38 to 5
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    38 non-null     object
 1   Text    38 non-null     object
dtypes: object(2)
memory usage: 912.0+ bytes


In [None]:
# news_df + is_df + fomc_df
import pandas as pd

df = pd.concat([news_df, is_df, fomc_df], axis=0, join='outer')

# 'date' 열을 datetime 형식으로 변환
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# 'date' 열의 값을 문자열 형식으로 변환
df['date'] = df['date'].dt.strftime("%Y-%m-%d")
df.sort_values(by = 'date', inplace = True)
df

Unnamed: 0,date,Text
0,2019-01-01,The fear of automation and artificial intellig...
1,2019-01-02,The manufacturing sector hit a nine-month low ...
2,2019-01-03,"Futures put the Nasdaq Composite, which is hea..."
3,2019-01-04,KHARTOUM/CAIRO (Reuters) - Short of time to sa...
4,2019-01-05,Federal Reserve Chairman Jerome Powell on Frid...
...,...,...
1779,2023-11-16,NEW YORK -- Stocks drifted to a mixed finish T...
1780,2023-11-17,"Krishna Sanghavi, CIO-Equities, Mahindra Manul..."
1781,2023-11-18,Comment on this story Comment Add to your save...
1782,2023-11-19,Longer-tenure EMIs (equated monthly instalment...


In [None]:
# 'date'로 그룹화하고 텍스트를 결합
df = df.groupby('date')['Text'].apply(lambda x: ' '.join(x)).reset_index()

df.to_csv('/content/drive/MyDrive/Colab Notebooks/Unclock-watchers/news/all_merged_data.csv')

##### 금리

In [None]:
rate = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Unclock-watchers/news/미국 1개월 채권수익률 과거 데이터 (1).csv')
rate.rename(columns={'날짜':'date', '종가':'closing price', '변동 %':'fluctuation'}, inplace = True)
rate = rate.drop(columns=['시가', '고가', '저가'])

In [None]:
rate = rate[rate['date'].str.startswith('2018') == False]
rate.sort_values(by = 'date', inplace = True)

rate

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rate.sort_values(by = 'date', inplace = True)


Unnamed: 0,date,closing price,fluctuation
1251,2019- 01- 01,2.435,0.00%
1250,2019- 01- 02,2.394,-1.69%
1249,2019- 01- 03,2.351,-1.80%
1248,2019- 01- 04,2.412,2.59%
1247,2019- 01- 07,2.410,-0.08%
...,...,...,...
4,2023- 10- 25,5.399,-0.06%
3,2023- 10- 26,5.395,-0.07%
2,2023- 10- 27,5.408,0.24%
1,2023- 10- 30,5.400,-0.15%


##### 금리가 없는 날짜 삭제

In [None]:
df['date'] = pd.to_datetime(df['date'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1784 entries, 0 to 1783
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    1784 non-null   datetime64[ns]
 1   Text    1784 non-null   object        
dtypes: datetime64[ns](1), object(1)
memory usage: 28.0+ KB


In [None]:
rate['date'] = pd.to_datetime(rate['date'])
rate.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1252 entries, 1251 to 0
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           1252 non-null   datetime64[ns]
 1   closing price  1252 non-null   float64       
 2   fluctuation    1252 non-null   object        
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 39.1+ KB


In [None]:
data_df = pd.merge(df, rate, how='outer', on='date')
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1785 entries, 0 to 1784
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           1785 non-null   datetime64[ns]
 1   Text           1784 non-null   object        
 2   closing price  1252 non-null   float64       
 3   fluctuation    1252 non-null   object        
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 69.7+ KB


In [None]:
data_df = data_df.dropna()
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1251 entries, 0 to 1763
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           1251 non-null   datetime64[ns]
 1   Text           1251 non-null   object        
 2   closing price  1251 non-null   float64       
 3   fluctuation    1251 non-null   object        
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 48.9+ KB


In [None]:
data_df.to_csv('/content/drive/MyDrive/Colab Notebooks/Unclock-watchers/news/complete_data.csv')

##### 전처리: 금리 + fomc, interview, speech (2019.01.01 ~ 2023.10.31)

In [None]:
import pandas as pd

data_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Unclock-watchers/news/complete_data.csv')

In [None]:
data_df.drop('Unnamed: 0', axis = 1, inplace = True)

In [None]:
data_df

Unnamed: 0,date,Text,closing price,fluctuation
0,2019-01-01,The fear of automation and artificial intellig...,2.435,0.00%
1,2019-01-02,The manufacturing sector hit a nine-month low ...,2.394,-1.69%
2,2019-01-03,"Futures put the Nasdaq Composite, which is hea...",2.351,-1.80%
3,2019-01-04,KHARTOUM/CAIRO (Reuters) - Short of time to sa...,2.412,2.59%
4,2019-01-07,LONDON (Reuters) - A set of strong employment ...,2.410,-0.08%
...,...,...,...,...
1246,2023-10-25,New home sales in the United States surged hig...,5.399,-0.06%
1247,2023-10-26,US stocks fell on Thursday under the pressure ...,5.395,-0.07%
1248,2023-10-27,The benchmark S&amp;P 500 index closed in corr...,5.408,0.24%
1249,2023-10-30,"Petronas, Greenko Founders, GIC Set to Put $2b...",5.400,-0.15%


###### 정규 표현식 적용_숫자, 특수문자, 2개 이하 삭제

In [None]:
# preprocessiong: 정규화
import re

data_df['Text'] = data_df['Text'].str.lower()
data_df['Text'] = data_df.apply(lambda row: re.sub(r'\b\w{1,2}\b|[^a-z\s]', '', row['Text']), axis=1)
data_df.to_csv('/content/drive/MyDrive/Colab Notebooks/Unclock-watchers/news/preprocessing_1.csv')

In [None]:
data_df.head(1)

######  . . .

In [None]:
from nltk.tokenize import word_tokenize

# word_tokenize 해서 Text에 덮어쓰기
data_df['Text'] = data_df['Text'].apply(word_tokenize)

In [None]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

# 불용어 제거해서 덮어쓰기
data_df['Text'] = data_df['Text'].apply(lambda tokens: [word for word in tokens if word.lower() not in stop_words])

# 결과 확인
data_df.head(3)

###### 행 별 불용어 처리

In [None]:
# 'Text' 열에 대해 불용어 제거 수행
data_df['Text'] = data_df['Text'].apply(lambda row: ' '.join([word for word in word_tokenize(row) if word.lower() not in stop_words]))
data_df

Unnamed: 0,date,Text,closing price,fluctuation
0,2019-01-01,fear automation artificial intelligence spurre...,2.435,0.00%
1,2019-01-02,manufacturing sector hit ninemonth low decembe...,2.394,-1.69%
2,2019-01-03,futures put nasdaq composite heavily weighted ...,2.351,-1.80%
3,2019-01-04,khartoumcairo reuters short time save sinking ...,2.412,2.59%
4,2019-01-07,london reuters set strong employment data unit...,2.410,-0.08%
...,...,...,...,...
1246,2023-10-25,new home sales united states surged higher sep...,5.399,-0.06%
1247,2023-10-26,stocks fell thursday pressure disappointing th...,5.395,-0.07%
1248,2023-10-27,benchmark amp index closed correction territor...,5.408,0.24%
1249,2023-10-30,petronas greenko founders gic set put greenpet...,5.400,-0.15%


In [None]:
data_df.head(1)

Unnamed: 0,date,Text,closing price,fluctuation
0,2019-01-01,the fear automation and artificial intelligen...,2.435,0.00%


###### 행별 불용어 및 품사

In [None]:
import nltk
nltk.download('punkt')
nltk.download('stopwords') # 불용어처리
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\91pp9\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91pp9\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\91pp9\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\91pp9\AppData\Roaming\nltk_data...


True

In [53]:
import pandas as pd

data_df = pd.read_csv("C:/Users/91pp9/Desktop/종로 새싹_핀테크 데이터 분석가 과정_1기/teamproject/Unclock-Watchers git/Minji/data/preprocessing_1.csv", index_col = 0)

In [54]:
data_df

Unnamed: 0,date,Text,closing price,fluctuation
0,2019-01-01,the fear automation and artificial intelligen...,2.435,0.00%
1,2019-01-02,the manufacturing sector hit ninemonth low d...,2.394,-1.69%
2,2019-01-03,futures put the nasdaq composite which heavil...,2.351,-1.80%
3,2019-01-04,khartoumcairo reuters short time save his s...,2.412,2.59%
4,2019-01-07,london reuters set strong employment data ...,2.410,-0.08%
...,...,...,...,...
1246,2023-10-25,new home sales the united states surged highe...,5.399,-0.06%
1247,2023-10-26,stocks fell thursday under the pressure dis...,5.395,-0.07%
1248,2023-10-27,the benchmark amp index closed correction te...,5.408,0.24%
1249,2023-10-30,petronas greenko founders gic set put gree...,5.400,-0.15%


In [55]:
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = stopwords.words('english')

data_df['Text'] = data_df['Text'].apply(lambda row: [word for word in word_tokenize(row) if word.lower() not in stop_words])

In [56]:
data_df

Unnamed: 0,date,Text,closing price,fluctuation
0,2019-01-01,"[fear, automation, artificial, intelligence, s...",2.435,0.00%
1,2019-01-02,"[manufacturing, sector, hit, ninemonth, low, d...",2.394,-1.69%
2,2019-01-03,"[futures, put, nasdaq, composite, heavily, wei...",2.351,-1.80%
3,2019-01-04,"[khartoumcairo, reuters, short, time, save, si...",2.412,2.59%
4,2019-01-07,"[london, reuters, set, strong, employment, dat...",2.410,-0.08%
...,...,...,...,...
1246,2023-10-25,"[new, home, sales, united, states, surged, hig...",5.399,-0.06%
1247,2023-10-26,"[stocks, fell, thursday, pressure, disappointi...",5.395,-0.07%
1248,2023-10-27,"[benchmark, amp, index, closed, correction, te...",5.408,0.24%
1249,2023-10-30,"[petronas, greenko, founders, gic, set, put, g...",5.400,-0.15%


In [57]:
# preprocessing_2: 불용어 
data_df.to_csv("C:/Users/91pp9/Desktop/종로 새싹_핀테크 데이터 분석가 과정_1기/teamproject/Unclock-Watchers git/Minji/data/preprocessing_2.csv")

In [2]:
import pandas as pd

data_df = pd.read_csv("C:/Users/91pp9/Desktop/종로 새싹_핀테크 데이터 분석가 과정_1기/teamproject/Unclock-Watchers git/Minji/data/preprocessing_2.csv", index_col = 0)
# data_df.drop('Unnamed: 0', axis = 1, inplace = True)
data_df

Unnamed: 0,date,Text,closing price,fluctuation
0,2019-01-01,"['fear', 'automation', 'artificial', 'intellig...",2.435,0.00%
1,2019-01-02,"['manufacturing', 'sector', 'hit', 'ninemonth'...",2.394,-1.69%
2,2019-01-03,"['futures', 'put', 'nasdaq', 'composite', 'hea...",2.351,-1.80%
3,2019-01-04,"['khartoumcairo', 'reuters', 'short', 'time', ...",2.412,2.59%
4,2019-01-07,"['london', 'reuters', 'set', 'strong', 'employ...",2.410,-0.08%
...,...,...,...,...
1246,2023-10-25,"['new', 'home', 'sales', 'united', 'states', '...",5.399,-0.06%
1247,2023-10-26,"['stocks', 'fell', 'thursday', 'pressure', 'di...",5.395,-0.07%
1248,2023-10-27,"['benchmark', 'amp', 'index', 'closed', 'corre...",5.408,0.24%
1249,2023-10-30,"['petronas', 'greenko', 'founders', 'gic', 'se...",5.400,-0.15%


In [4]:
import re
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# 데이터프레임의 'Text' 열에 있는 각 행에 대해 처리
for idx, row in data_df.iterrows():
    # 정규 표현식을 사용하여 특수문자 제거 
    cleaned_text = re.sub(r"[^a-zA-Z0-9\s]", "", row['Text'])
    # 토큰화 및 품사 태깅
    tokens = word_tokenize(cleaned_text)
    pos_tags = pos_tag(tokens)
    
    # 품사 태깅 결과를 데이터프레임에 저장 (예시로 'POS_Tagging' 열에 저장)
    data_df.at[idx, 'Text'] = pos_tags

# 결과 확인
data_df['Text'][0]

[('fear', 'NN'),
 ('automation', 'NN'),
 ('artificial', 'JJ'),
 ('intelligence', 'NN'),
 ('spurred', 'VBD'),
 ('countless', 'JJ'),
 ('conversations', 'NNS'),
 ('future', 'JJ'),
 ('work', 'NN'),
 ('humans', 'NNS'),
 ('fare', 'JJ'),
 ('age', 'NN'),
 ('robots', 'VBD'),
 ('many', 'JJ'),
 ('fear', 'JJ'),
 ('technological', 'JJ'),
 ('unemployment', 'NN'),
 ('others', 'NNS'),
 ('preemptively', 'RB'),
 ('denounce', 'JJ'),
 ('future', 'JJ'),
 ('income', 'NN'),
 ('inequality', 'NN'),
 ('confront', 'VBP'),
 ('dismal', 'JJ'),
 ('future', 'JJ'),
 ('work', 'NN'),
 ('many', 'JJ'),
 ('policy', 'NN'),
 ('world', 'NN'),
 ('tech', 'NN'),
 ('world', 'NN'),
 ('even', 'RB'),
 ('aspiring', 'VBG'),
 ('presidential', 'JJ'),
 ('candidates', 'NNS'),
 ('like', 'IN'),
 ('andrew', 'NN'),
 ('yang', 'NN'),
 ('tout', 'IN'),
 ('promise', 'NN'),
 ('universal', 'JJ'),
 ('basic', 'JJ'),
 ('income', 'NN'),
 ('ubi', 'JJ'),
 ('overcome', 'JJ'),
 ('uncertainties', 'NNS'),
 ('whether', 'IN'),
 ('policymakers', 'NNS'),
 ('even'

In [6]:
# preprocessing_3: 품사 태깅 
data_df.to_csv("C:/Users/91pp9/Desktop/종로 새싹_핀테크 데이터 분석가 과정_1기/teamproject/Unclock-Watchers git/Minji/data/preprocessing_3.csv")

In [1]:
import pandas as pd

data_df = pd.read_csv('C:/Users/91pp9/Desktop/종로 새싹_핀테크 데이터 분석가 과정_1기/teamproject/Unclock-Watchers git/Minji/data/preprocessing_3.csv', index_col = 0)
data_df

Unnamed: 0,date,Text,closing price,fluctuation
0,2019-01-01,"[('fear', 'NN'), ('automation', 'NN'), ('artif...",2.435,0.00%
1,2019-01-02,"[('manufacturing', 'NN'), ('sector', 'NN'), ('...",2.394,-1.69%
2,2019-01-03,"[('futures', 'NNS'), ('put', 'VBD'), ('nasdaq'...",2.351,-1.80%
3,2019-01-04,"[('khartoumcairo', 'NN'), ('reuters', 'NNS'), ...",2.412,2.59%
4,2019-01-07,"[('london', 'JJ'), ('reuters', 'NNS'), ('set',...",2.410,-0.08%
...,...,...,...,...
1246,2023-10-25,"[('new', 'JJ'), ('home', 'NN'), ('sales', 'NNS...",5.399,-0.06%
1247,2023-10-26,"[('stocks', 'NNS'), ('fell', 'VBD'), ('thursda...",5.395,-0.07%
1248,2023-10-27,"[('benchmark', 'NN'), ('amp', 'NN'), ('index',...",5.408,0.24%
1249,2023-10-30,"[('petronas', 'NNS'), ('greenko', 'VBP'), ('fo...",5.400,-0.15%


In [2]:
data_df['Text'][0]



In [3]:
# 필요한 품사만 추출
desired_pos = ['NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBD', 'VBT', 'VBN', 'VBP', 'VBZ', 'RB', 'RBP', 'RBS', 'JJ', 'JJR', 'JJS']

# 'Text' 열의 각 행에 대해 작업 수행
for idx, row in data_df.iterrows():
    # 'Text' 열의 값이 문자열인 경우에 대비하여 리스트로 변환
    word_pos_list = eval(row['Text']) if isinstance(row['Text'], str) else row['Text']
    
    # 각 튜플에서 특정 품사들만 추출하여 새로운 리스트 생성
    filtered_words = [word for word, pos in word_pos_list if pos in desired_pos]
    
    # 새로운 리스트를 'Text' 열에 저장
    data_df.at[idx, 'Text'] = filtered_words


In [4]:
data_df['Text'][0]

['fear',
 'automation',
 'artificial',
 'intelligence',
 'spurred',
 'countless',
 'conversations',
 'future',
 'work',
 'humans',
 'fare',
 'age',
 'robots',
 'many',
 'fear',
 'technological',
 'unemployment',
 'others',
 'preemptively',
 'denounce',
 'future',
 'income',
 'inequality',
 'confront',
 'dismal',
 'future',
 'work',
 'many',
 'policy',
 'world',
 'tech',
 'world',
 'even',
 'presidential',
 'candidates',
 'andrew',
 'yang',
 'promise',
 'universal',
 'basic',
 'income',
 'ubi',
 'overcome',
 'uncertainties',
 'policymakers',
 'even',
 'consider',
 'ideas',
 'ubi',
 'given',
 'recent',
 'studies',
 'suggest',
 'expanded',
 'labor',
 'market',
 'future',
 'relatively',
 'strong',
 'current',
 'economic',
 'conditions',
 'still',
 'open',
 'question',
 'rather',
 'familiar',
 'concerns',
 'ubi',
 'risk',
 'dependency',
 'price',
 'tag',
 'implement',
 'work',
 'often',
 'provides',
 'life',
 'worthwhile',
 'recognize',
 'case',
 'broad',
 'radical',
 'reform',
 'economic',

In [5]:
# preprocessing_4: 명사, 동사, 형용사, 부사만 남긴 것 
data_df.to_csv("C:/Users/91pp9/Desktop/종로 새싹_핀테크 데이터 분석가 과정_1기/teamproject/Unclock-Watchers git/Minji/data/preprocessing_4.csv")

###### 표제어 추출

In [6]:
import pandas as pd

data_df = pd.read_csv('C:/Users/91pp9/Desktop/종로 새싹_핀테크 데이터 분석가 과정_1기/teamproject/Unclock-Watchers git/Minji/data/preprocessing_4.csv', index_col = 0)
data_df

Unnamed: 0,date,Text,closing price,fluctuation
0,2019-01-01,"['fear', 'automation', 'artificial', 'intellig...",2.435,0.00%
1,2019-01-02,"['manufacturing', 'sector', 'hit', 'ninemonth'...",2.394,-1.69%
2,2019-01-03,"['futures', 'put', 'composite', 'heavily', 'we...",2.351,-1.80%
3,2019-01-04,"['khartoumcairo', 'reuters', 'short', 'time', ...",2.412,2.59%
4,2019-01-07,"['london', 'reuters', 'set', 'strong', 'employ...",2.410,-0.08%
...,...,...,...,...
1246,2023-10-25,"['new', 'home', 'sales', 'united', 'states', '...",5.399,-0.06%
1247,2023-10-26,"['stocks', 'fell', 'thursday', 'pressure', 'di...",5.395,-0.07%
1248,2023-10-27,"['benchmark', 'amp', 'index', 'closed', 'corre...",5.408,0.24%
1249,2023-10-30,"['petronas', 'greenko', 'founders', 'gic', 'se...",5.400,-0.15%


In [8]:
from nltk.stem import WordNetLemmatizer

# NLTK WordNetLemmatizer를 사용하여 표제어 추출
lemmatizer = WordNetLemmatizer()

# 'Text' 열의 각 행에 대해 작업 수행
for idx, row in data_df.iterrows():
    # 'Text' 열의 값이 문자열인 경우에 대비하여 리스트로 변환
    word_list = eval(row['Text']) if isinstance(row['Text'], str) else row['Text']
    
    # 표제어 추출
    lemmatized_words = [lemmatizer.lemmatize(word) for word in word_list]
    
    # 새로운 리스트를 'Text' 열에 저장
    data_df.at[idx, 'Text'] = lemmatized_words


In [11]:
# preprocessing_5: 정규 표현식, 불용어 처리, 품사 태깅 및 제거, 표제어 추출 완료 
data_df.to_csv("C:/Users/91pp9/Desktop/종로 새싹_핀테크 데이터 분석가 과정_1기/teamproject/Unclock-Watchers git/Minji/data/preprocessing_5.csv")

##### 라벨_상승, 하락

In [21]:
import pandas as pd

data_df = pd.read_csv('C:/Users/91pp9/Desktop/종로 새싹_핀테크 데이터 분석가 과정_1기/teamproject/Unclock-Watchers git/Minji/data/preprocessing_4.csv', index_col = 0)
data_df

Unnamed: 0,date,Text,closing price,fluctuation
0,2019-01-01,"['fear', 'automation', 'artificial', 'intellig...",2.435,0.00%
1,2019-01-02,"['manufacturing', 'sector', 'hit', 'ninemonth'...",2.394,-1.69%
2,2019-01-03,"['futures', 'put', 'composite', 'heavily', 'we...",2.351,-1.80%
3,2019-01-04,"['khartoumcairo', 'reuters', 'short', 'time', ...",2.412,2.59%
4,2019-01-07,"['london', 'reuters', 'set', 'strong', 'employ...",2.410,-0.08%
...,...,...,...,...
1246,2023-10-25,"['new', 'home', 'sales', 'united', 'states', '...",5.399,-0.06%
1247,2023-10-26,"['stocks', 'fell', 'thursday', 'pressure', 'di...",5.395,-0.07%
1248,2023-10-27,"['benchmark', 'amp', 'index', 'closed', 'corre...",5.408,0.24%
1249,2023-10-30,"['petronas', 'greenko', 'founders', 'gic', 'se...",5.400,-0.15%


In [22]:
data_df['Daily_rtn'] = data_df['closing price'].pct_change()
data_df

Unnamed: 0,date,Text,closing price,fluctuation,Daily_rtn
0,2019-01-01,"['fear', 'automation', 'artificial', 'intellig...",2.435,0.00%,
1,2019-01-02,"['manufacturing', 'sector', 'hit', 'ninemonth'...",2.394,-1.69%,-0.016838
2,2019-01-03,"['futures', 'put', 'composite', 'heavily', 'we...",2.351,-1.80%,-0.017962
3,2019-01-04,"['khartoumcairo', 'reuters', 'short', 'time', ...",2.412,2.59%,0.025946
4,2019-01-07,"['london', 'reuters', 'set', 'strong', 'employ...",2.410,-0.08%,-0.000829
...,...,...,...,...,...
1246,2023-10-25,"['new', 'home', 'sales', 'united', 'states', '...",5.399,-0.06%,-0.000555
1247,2023-10-26,"['stocks', 'fell', 'thursday', 'pressure', 'di...",5.395,-0.07%,-0.000741
1248,2023-10-27,"['benchmark', 'amp', 'index', 'closed', 'corre...",5.408,0.24%,0.002410
1249,2023-10-30,"['petronas', 'greenko', 'founders', 'gic', 'se...",5.400,-0.15%,-0.001479


##### n-gram