# 데이터 수집 

## 수집기간 정의

In [16]:
import GetOldTweets3 as got
import datetime
import time
import pandas as pd

In [42]:
days_range = []

# datetime 패키지, datetime 클래스(날짜와 시간 함께 저장)
# strptime 메서드: 문자열 반환
start = datetime.datetime.strptime("2017-01-20", "%Y-%m-%d")
end = datetime.datetime.strptime("2020-07-31", "%Y-%m-%d")

date_generated = [start+datetime.timedelta(days=x) for x in range(0, (end-start).days)]

for date in date_generated:
    days_range.append(date.strftime("%Y-%m-%d"))
    
print("===트윗 수집 기간: {} ~ {}===".format(days_range[0], days_range[-1]))
print("===총 {}일 데이터 수집===".format(len(days_range)))

===트윗 수집 기간: 2017-01-20 ~ 2020-07-30===
===총 1288일 데이터 수집===


datetime.timedelta 클래스
두 date, time, datetime 인스턴스 간의 차이를 마이크로초 해상도로 나타내는 기간


timedelta 객체
두 날짜나 시간의 차이인 기간
class datetime.timedelta(days=0, seconds=0, microseconds=0, milliseconds=0, minutes=0, hours=0, weeks=0)
모든 인자는 선택적이며 기본값은 0입니다. 인자는 정수나 부동 소수점 수일 수 있으며, 양수나 음수일 수 있습니다.

days, seconds 및 microseconds만 내부적으로 저장됩니다. 인자는 이 단위로 변환됩니다:

밀리 초는 1000마이크로초로 변환됩니다.

분은 60초로 변환됩니다.

시간은 3600초로 변환됩니다.

주는 7일로 변환됩니다.

## 트윗 수집

In [10]:
# 수집 기준 정의
tweet_criteria = got.manager.TweetCriteria().setUsername("realDonaldTrump").setSince("2017-01-20").setUntil("2020-07-31")

In [14]:
print("데이터 수집 시작===")
start_time = time.time()
tweet = got.manager.TweetManager.getTweets(tweet_criteria)

print("데이터 수집 완료==={0:0.2f}분".format((time.time() - start_time)/60))
print("===총 트윗 개수 {}===".format(len(tweet)))

데이터 수집 시작===
데이터 수집 완료===17.56분
===총 트윗 개수 12599===


## 변수 저장하기

In [55]:
# 날짜, 트윗내용
from tqdm.notebook import tqdm

tweet_list = []

for i in tqdm(tweet):
    
    tweet_date = i.date.strftime("%Y-%m-%d")
    tweet_time = i.date.strftime("%H:%M:%d")
    content = i.text
    
    info_list = [tweet_date, tweet_time, content]
    tweet_list.append(info_list)

HBox(children=(FloatProgress(value=0.0, max=12599.0), HTML(value='')))




In [56]:
tweet_df = pd.DataFrame(tweet_list, columns = ["날짜", "시간", "내용"])

tweet_df.to_csv("tweets_trump_to2020.csv", index = False)
print("==={}개 트윗 저장===".format(len(tweet_list)))

===12599개 트윗 저장===


# 데이터 전처리

In [92]:
df = pd.read_csv("tweets_trump_to2020.csv", parse_dates=["날짜"])
df.head(20)

Unnamed: 0,날짜,시간,내용
0,2020-07-30,23:33:30,Great to spend time with Mike Singletary while...
1,2020-07-30,22:45:30,My Administration has been focused on finding ...
2,2020-07-30,22:34:30,"As the Wall goes up, illegal crossings go down..."
3,2020-07-30,21:20:30,
4,2020-07-30,20:23:30,"We are going to WIN the 2020 Election, BIG! #MAGA"
5,2020-07-30,20:22:30,Must know Election results on the night of the...
6,2020-07-30,20:22:30,Glad I was able to get the very dishonest Lame...
7,2020-07-30,18:42:30,"...the phone with his amazing wife Gloria, dau..."
8,2020-07-30,18:42:30,"My friend Herman Cain, a Powerful Voice of Fre..."
9,2020-07-30,14:26:30,"Support Patio Pizza and its wonderful owner, G..."


In [93]:
df.tail(20)

Unnamed: 0,날짜,시간,내용
12579,2017-01-24,11:11:24,Will be meeting at 9:00 with top automobile ex...
12580,2017-01-23,11:38:23,Busy week planned with a heavy focus on jobs a...
12581,2017-01-22,14:23:22,Peaceful protests are a hallmark of our democr...
12582,2017-01-22,12:51:22,"Wow, television ratings just out: 31 million p..."
12583,2017-01-22,12:47:22,Watched protests yesterday but was under the i...
12584,2017-01-22,12:35:22,Had a great meeting at CIA Headquarters yester...
12585,2017-01-21,11:53:21,A fantastic day and evening in Washington D.C....
12586,2017-01-20,18:13:20,TO ALL AMERICANS https://www.facebook.com/Dona...
12587,2017-01-20,18:00:20,"So to all Americans, in every city near and fa..."
12588,2017-01-20,17:58:20,It is time to remember that...https://www.face...


In [94]:
df.dtypes

날짜    datetime64[ns]
시간            object
내용            object
dtype: object

In [99]:
df['시간'] = pd.to_datetime(df['시간'], errors='coerce')

In [100]:
df.dtypes

날짜            object
시간    datetime64[ns]
내용            object
dtype: object

In [101]:
df.head()

Unnamed: 0,날짜,시간,내용
0,2020-07-30,1900-01-01 23:33:30,Great to spend time with Mike Singletary while...
1,2020-07-30,1900-01-01 22:45:30,My Administration has been focused on finding ...
2,2020-07-30,1900-01-01 22:34:30,"As the Wall goes up, illegal crossings go down..."
3,2020-07-30,1900-01-01 21:20:30,
4,2020-07-30,1900-01-01 20:23:30,"We are going to WIN the 2020 Election, BIG! #MAGA"


In [97]:
df['날짜'] = df['날짜'].dt.date
df['연'] = df['날짜'].dt.year
df['월'] = df['날짜'].dt.month
df['일'] = df['날짜'].dt.day

AttributeError: Can only use .dt accessor with datetimelike values

In [75]:
df['시간'] = df['시간'].dt.time
df['시'] = df['시간'].dt.hour
df['분'] = df['시간'].dt.minute
df['초'] = df['시간'].dt.second

AttributeError: Can only use .dt accessor with datetimelike values

In [34]:
df[df['내용'].isnull()]

Unnamed: 0,날짜,시간,내용
3,2020-07-30,21:20:30S,
20,2020-07-29,19:46:29S,
21,2020-07-29,19:45:29S,
67,2020-07-26,02:57:26S,
68,2020-07-26,02:54:26S,
...,...,...,...
9448,2018-06-12,08:02:12S,
9511,2018-06-05,20:37:05S,
9550,2018-06-02,16:28:02S,
9920,2018-04-14,01:31:14S,


In [54]:
df[['날짜']]

Unnamed: 0,날짜
0,2020-07-30
1,2020-07-30
2,2020-07-30
3,2020-07-30
4,2020-07-30
...,...
12594,2017-01-20
12595,2017-01-20
12596,2017-01-20
12597,2017-01-20


In [None]:
from tqdm.notebook import tqdm

tweet_list = []

for i in tqdm(tweet):
    
    tweet_date = i.date.strftime("%Y-%m-%d")
    tweet_time = i.date.strftime("%H:%M:%dS")
    content = i.text
    
    info_list = [tweet_date, tweet_time, content]
    tweet_list.append(info_list)