# <center>2016年美国大选民意调查数据统计</center>

* 目标：分析希拉里和特朗普每个月的民意调查统计趋势
* 涉及知识点：
    * numpy读取文本文件
    * 处理日期格式数据
    * numpy的切片和索引
    * 过滤函数filter
    * numpy的统计方法
    * 列表推导式

In [1]:
import numpy as np

In [2]:
filename = '../data/presidential_polls.csv'
with open(filename, 'r') as f:
    # Read only the header line. Strip the line terminator explicitly instead
    # of slicing with [:-1]: slicing would cut off the last character of the
    # final column name if the header line were not newline-terminated.
    col_names_str = f.readline().rstrip('\r\n')
# Split the comma-separated header into a list of column names
col_name_lst = col_names_str.split(',')
print(col_name_lst)

['cycle', 'branch', 'type', 'matchup', 'forecastdate', 'state', 'startdate', 'enddate', 'pollster', 'grade', 'samplesize', 'population', 'poll_wt', 'rawpoll_clinton', 'rawpoll_trump', 'rawpoll_johnson', 'rawpoll_mcmullin', 'adjpoll_clinton', 'adjpoll_trump', 'adjpoll_johnson', 'adjpoll_mcmullin', 'multiversions', 'url', 'poll_id', 'question_id', 'createddate', 'timestamp']


In [3]:
# Columns of interest: poll end date, raw poll numbers for Clinton and Trump,
# and adjusted poll numbers for Clinton and Trump
use_cols = ['enddate', 'rawpoll_clinton', 'rawpoll_trump', 'adjpoll_clinton', 'adjpoll_trump']
# Translate every wanted column name into its position in the CSV header
use_col_index_lst = list(map(col_name_lst.index, use_cols))
use_col_index_lst

[7, 13, 14, 17, 18]

In [4]:
# Load only the selected columns, keeping every field as text for now;
# the date column is converted in a later step
data_array = np.loadtxt(
    filename,
    dtype=str,                  # read all values as strings
    delimiter=',',              # fields are comma-separated
    skiprows=1,                 # skip the first row, which holds the column names
    usecols=use_col_index_lst,  # restrict the read to the chosen columns
)
data_array[:10]

array([['10/31/2016', '37.69 ', '35.07 ', '42.64 ', '40.87 '],
       ['10/30/2016', '45.00 ', '46.00 ', '43.30 ', '44.73 '],
       ['10/30/2016', '48.00 ', '42.00 ', '46.30 ', '40.73 '],
       ['10/24/2016', '48.00 ', '45.00 ', '46.36 ', '45.31 '],
       ['10/25/2016', '46.00 ', '40.00 ', '45.33 ', '42.21 '],
       ['10/25/2016', '44.00 ', '41.00 ', '44.65 ', '42.27 '],
       ['10/31/2016', '44.60 ', '43.70 ', '46.22 ', '43.56 '],
       ['10/30/2016', '47.00 ', '44.00 ', '46.89 ', '43.50 '],
       ['10/27/2016', '41.70 ', '36.40 ', '41.23 ', '37.25 '],
       ['10/31/2016', '42.00 ', '40.00 ', '42.22 ', '41.70 ']],
      dtype='<U10')

In [5]:
# 查看数组形态
data_array.shape

(10236, 5)

In [6]:
import datetime

# Handle the date-formatted column
enddate_idx = use_cols.index('enddate')

# Slice out the date column and turn it into a plain list for easier processing
enddate_lst = data_array[:, enddate_idx].tolist()

# Parse each 'month/day/year' string into a datetime object
date_lst = [
    datetime.datetime.strptime(raw_date, '%m/%d/%Y')
    for raw_date in enddate_lst
]

# Build a 'YYYY-MM' label for every row
month_lst = [
    '{:d}-{:02d}'.format(parsed.year, parsed.month)
    for parsed in date_lst
]

# Array of per-row month labels, plus the sorted set of distinct months
month_array = np.array(month_lst)
months = np.unique(month_array)
months

array(['2015-11', '2015-12', '2016-01', '2016-02', '2016-03', '2016-04',
       '2016-05', '2016-06', '2016-07', '2016-08', '2016-09', '2016-10'],
      dtype='<U7')

In [7]:
# Position of the 'rawpoll_trump' column within the loaded subset of columns
rawpoll_trump_idx = use_cols.index('rawpoll_trump')
# Column slice of data_array for that position (values are still strings)
rawpoll_trump_data = data_array[:, rawpoll_trump_idx]

In [8]:
# rawpoll_trump_data1= rawpoll_trump_data.astype(float)

ValueError: could not convert string to float: 'rv'

In [9]:
# Inspect the dtype of column 2 of data_array (position 2 in use_cols is 'rawpoll_trump')
data_array[:,2].dtype

dtype('<U10')

In [11]:
def is_convert_float(s):
    """
    Report whether a value can be converted to float.

    Args:
        s: The value to check, typically a string read from the data file.

    Returns:
        True if ``float(s)`` succeeds, False if the conversion is rejected.
    """
    try:
        float(s)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not a number". A bare except would
        # also swallow KeyboardInterrupt / SystemExit and hide real problems.
        return False
    return True

In [12]:
def get_sum(str_array):
    """
    Sum the numeric entries of an array of strings.

    Entries for which is_convert_float returns False are ignored; the
    remaining entries are converted to float and added together.
    """
    # Keep only the entries that can be parsed as numbers
    numeric_strings = [item for item in str_array if is_convert_float(item)]
    # Convert to a float array and total it
    return np.array(numeric_strings, dtype=float).sum()

In [13]:
# Aggregate Trump's raw poll numbers per month:
# for each month, select the rows whose end date falls in that month and
# pair the month label with the sum of their numeric raw poll values
rawpoll_trump_results = [
    (month, get_sum(rawpoll_trump_data[month_array == month]))
    for month in months
]
rawpoll_trump_results

[('2015-11', 1948.1999999999998),
 ('2015-12', 4164.3),
 ('2016-01', 6267.0),
 ('2016-02', 7528.200000000001),
 ('2016-03', 9626.7),
 ('2016-04', 9396.3),
 ('2016-05', 11456.1061505),
 ('2016-06', 16551.9),
 ('2016-07', 21426.989999999998),
 ('2016-08', 59529.0),
 ('2016-09', 98418.9955574),
 ('2016-10', 155392.975543)]

In [14]:
# Aggregate Clinton's raw poll numbers per month
rawpoll_clinton_idx = use_cols.index('rawpoll_clinton')
rawpoll_clinton_data = data_array[:, rawpoll_clinton_idx]

# For each month, select the rows whose end date falls in that month and
# pair the month label with the sum of their numeric raw poll values
rawpoll_clinton_results = [
    (month, get_sum(rawpoll_clinton_data[month_array == month]))
    for month in months
]
rawpoll_clinton_results

[('2015-11', 1920.0),
 ('2015-12', 4816.8),
 ('2016-01', 6861.60007685),
 ('2016-02', 8271.6002536),
 ('2016-03', 11656.202546),
 ('2016-04', 11911.8039268),
 ('2016-05', 12155.7247957),
 ('2016-06', 20721.0537163),
 ('2016-07', 22007.0138546),
 ('2016-08', 63619.396242),
 ('2016-09', 96476.6678689),
 ('2016-10', 162766.8889154)]