diff --git a/examples/GooglesWord2Vec.ipynb b/examples/GooglesWord2Vec.ipynb deleted file mode 100644 index 115c3b2..0000000 --- a/examples/GooglesWord2Vec.ipynb +++ /dev/null @@ -1,567 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using TensorFlow backend.\n" - ] - } - ], - "source": [ - "import collections\n", - "import numpy as np\n", - "import pandas as pd\n", - "import gensim\n", - "from keras.models import Sequential\n", - "from keras.layers import Dense\n", - "from keras.layers.convolutional import Convolution1D\n", - "from keras.layers.convolutional import MaxPooling1D\n", - "from keras.layers import Flatten\n", - "from keras.utils import np_utils\n", - "\n", - "from sklearn.preprocessing import LabelEncoder" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# Set random seed (for reproducibility)\n", - "np.random.seed(1000)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Set includes 3000000 words\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "//anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:14: DeprecationWarning: Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).\n" - ] - } - ], - "source": [ - "#Get word vectors using googles pretrained word2vec \n", - "#takes a minute \n", - "google = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin',binary=True)\n", - "\n", - "#includes some stop words (i.e. the, also, should, but not a, and, of)\n", - "#includes misspellings \n", - "#includes commony paired words (i.e. 
New_York)\n", - "\n", - "vocab = google.vocab.keys()\n", - "total_vocab = len(vocab)\n", - "print (\"Set includes\", total_vocab, \"words\")\n", - "\n", - "# Copy word vectors and delete Word2Vec model and original corpus to save memory\n", - "X_vecs = google.wv\n", - "#del google #wait to explore model first " - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "//anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:2: DeprecationWarning: Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).\n", - " from ipykernel import kernelapp as app\n" - ] - }, - { - "data": { - "text/plain": [ - "[('global_warming', 0.889603853225708),\n", - " ('Climate_Change', 0.7147639393806458),\n", - " ('Climate', 0.6953692436218262),\n", - " ('Global_warming', 0.661054253578186),\n", - " ('climate', 0.6569506525993347),\n", - " ('greenhouse_gas_emissions', 0.6449477076530457),\n", - " ('greenhouse_gases', 0.6432511806488037),\n", - " ('carbon_emissions', 0.6395047307014465),\n", - " ('Global_Warming', 0.6281516551971436),\n", - " ('reducing_carbon_emissions', 0.6227284669876099)]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from gensim.models import KeyedVectors\n", - "google.wv.most_similar('climate_change')" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "del google " - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([-0.05419922, 0.01708984, -0.00527954, 0.33203125, -0.25 ,\n", - " -0.01397705, -0.15039062, -0.265625 , 0.01647949, 0.3828125 ,\n", - " -0.03295898, -0.09716797, -0.16308594, -0.04443359, 0.00946045,\n", - " 0.18457031, 0.03637695, 0.16601562, 0.36328125, -0.25585938,\n", - " 0.375 , 0.171875 , 0.21386719, -0.19921875, 0.13085938,\n", - " -0.07275391, -0.02819824, 0.11621094, 0.15332031, 0.09082031,\n", - " 0.06787109, -0.0300293 , -0.16894531, -0.20800781, -0.03710938,\n", - " -0.22753906, 0.26367188, 0.012146 , 0.18359375, 0.31054688,\n", - " -0.10791016, -0.19140625, 0.21582031, 0.13183594, -0.03515625,\n", - " 0.18554688, -0.30859375, 0.04785156, -0.10986328, 0.14355469,\n", - " -0.43554688, -0.0378418 , 0.10839844, 0.140625 , -0.10595703,\n", - " 0.26171875, -0.17089844, 0.39453125, 0.12597656, -0.27734375,\n", - " -0.28125 , 0.14746094, -0.20996094, 0.02355957, 0.18457031,\n", - " 0.00445557, -0.27929688, -0.03637695, -0.29296875, 0.19628906,\n", - " 0.20703125, 0.2890625 , -0.20507812, 0.06787109, -0.43164062,\n", - " -0.10986328, -0.2578125 , -0.02331543, 0.11328125, 0.23144531,\n", - " -0.04418945, 0.10839844, -0.2890625 , -0.09521484, -0.10351562,\n", - " -0.0324707 , 0.07763672, -0.13378906, 0.22949219, 0.06298828,\n", - " 0.08349609, 0.02929688, -0.11474609, 0.00534058, -0.12988281,\n", - " 0.02514648, 0.08789062, 0.24511719, -0.11474609, -0.296875 ,\n", - " -0.59375 , -0.29492188, -0.13378906, 0.27734375, -0.04174805,\n", - " 0.11621094, 0.28320312, 0.00241089, 0.13867188, -0.00683594,\n", - " -0.30078125, 0.16210938, 0.01171875, -0.13867188, 0.48828125,\n", - " 0.02880859, 0.02416992, 0.04736328, 0.05859375, -0.23828125,\n", - " 0.02758789, 0.05981445, -0.03857422, 0.06933594, 0.14941406,\n", - " -0.10888672, -0.07324219, 0.08789062, 0.27148438, 0.06591797,\n", - " -0.37890625, -0.26171875, -0.13183594, 0.09570312, -0.3125 ,\n", - " 
0.10205078, 0.03063965, 0.23632812, 0.00582886, 0.27734375,\n", - " 0.20507812, -0.17871094, -0.31445312, -0.01586914, 0.13964844,\n", - " 0.13574219, 0.0390625 , -0.29296875, 0.234375 , -0.33984375,\n", - " -0.11816406, 0.10644531, -0.18457031, -0.02099609, 0.02563477,\n", - " 0.25390625, 0.07275391, 0.13574219, -0.00138092, -0.2578125 ,\n", - " -0.2890625 , 0.10107422, 0.19238281, -0.04882812, 0.27929688,\n", - " -0.3359375 , -0.07373047, 0.01879883, -0.10986328, -0.04614258,\n", - " 0.15722656, 0.06689453, -0.03417969, 0.16308594, 0.08642578,\n", - " 0.44726562, 0.02026367, -0.01977539, 0.07958984, 0.17773438,\n", - " -0.04370117, -0.00952148, 0.16503906, 0.17285156, 0.23144531,\n", - " -0.04272461, 0.02355957, 0.18359375, -0.41601562, -0.01745605,\n", - " 0.16796875, 0.04736328, 0.14257812, 0.08496094, 0.33984375,\n", - " 0.1484375 , -0.34375 , -0.14160156, -0.06835938, -0.14648438,\n", - " -0.02844238, 0.07421875, -0.07666016, 0.12695312, 0.05859375,\n", - " -0.07568359, -0.03344727, 0.23632812, -0.16308594, 0.16503906,\n", - " 0.1484375 , -0.2421875 , -0.3515625 , -0.30664062, 0.00491333,\n", - " 0.17675781, 0.46289062, 0.14257812, -0.25 , -0.25976562,\n", - " 0.04370117, 0.34960938, 0.05957031, 0.07617188, -0.02868652,\n", - " -0.09667969, -0.01281738, 0.05859375, -0.22949219, -0.1953125 ,\n", - " -0.12207031, 0.20117188, -0.42382812, 0.06005859, 0.50390625,\n", - " 0.20898438, 0.11230469, -0.06054688, 0.33203125, 0.07421875,\n", - " -0.05786133, 0.11083984, -0.06494141, 0.05639648, 0.01757812,\n", - " 0.08398438, 0.13769531, 0.2578125 , 0.16796875, -0.16894531,\n", - " 0.01794434, 0.16015625, 0.26171875, 0.31640625, -0.24804688,\n", - " 0.05371094, -0.0859375 , 0.17089844, -0.39453125, -0.00156403,\n", - " -0.07324219, -0.04614258, -0.16210938, -0.15722656, 0.21289062,\n", - " -0.15820312, 0.04394531, 0.28515625, 0.01196289, -0.26953125,\n", - " -0.04370117, 0.37109375, 0.04663086, -0.19726562, 0.3046875 ,\n", - " -0.36523438, -0.23632812, 0.08056641, -0.04248047, -0.14648438,\n", - " -0.06225586, -0.0534668 , -0.05664062, 0.18945312, 0.37109375,\n", - " -0.22070312, 0.04638672, 0.02612305, -0.11474609, 0.265625 ,\n", - " -0.02453613, 0.11083984, -0.02514648, -0.12060547, 0.05297852,\n", - " 0.07128906, 0.00063705, -0.36523438, -0.13769531, -0.12890625], dtype=float32)" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#Explore the vectors \n", - "X_vecs['hello'] #check vectors \n", - "#X_vecs['global warming'.split()] #check vectors \n", - "#X_vecs['global_warming'] # Includes common phrases " - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Full dataset: 6090\n", - "dataset without NaN: 6087\n" - ] - } - ], - "source": [ - "# Load in data\n", - "# One hot encode sentiment \n", - "\n", - "data = pd.read_csv(\"../core/data/tweet_global_warming.csv\", encoding=\"latin\")\n", - "print(\"Full dataset: {}\".format(data.shape[0]))\n", - "data['existence'].fillna(value='ambiguous', inplace = True) #replace NA's in existence with \"ambiguous\"\n", - "data['existence'].replace(('Y', 'N'), ('Yes', 'No'), inplace=True) #rename so encoder doesnt get confused\n", - "data = data.dropna() #now drop NA values\n", - "print(\"dataset without NaN: {}\".format(data.shape[0]))\n", - "X = data.iloc[:,0]\n", - "Y = data.iloc[:,1]\n", - "#print(\"Number of unique words: {}\".format(len(np.unique(X)))) ##why is this wrong?? 
##\n", - "\n", - "#one hot encoding = dummy vars from categorical var \n", - "#Create a one-hot encoded binary matrix \n", - "#N, Y, Ambig\n", - "#1, 0, 0 \n", - "#0, 1, 0\n", - "#0, 0, 1\n", - "\n", - "#encode class as integers \n", - "encoder = LabelEncoder()\n", - "encoder.fit(Y)\n", - "encoded_Y = encoder.transform(Y) \n", - "\n", - "#convert integers to one hot encoded\n", - "Y = np_utils.to_categorical(encoded_Y)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "########I think we want to look at gensim over nltk for this data########\n", - "###################### See 3 cells below! #######\n", - "from nltk.stem.lancaster import LancasterStemmer\n", - "from nltk.tokenize import RegexpTokenizer\n", - "\n", - "corpus = X\n", - "# Tokenize and stem\n", - "tkr = RegexpTokenizer('[a-zA-Z0-9@]+')\n", - "stemmer = LancasterStemmer()\n", - "\n", - "tokenized_corpus = []\n", - "\n", - "for i, tweet in enumerate(corpus):\n", - " tokens = [stemmer.stem(t) for t in tkr.tokenize(tweet) if not t.startswith('@')]\n", - " tokenized_corpus.append(tokens)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "def read_data(data_file):\n", - " for i, line in enumerate (data_file): \n", - " # do some pre-processing and return a list of words for each review text\n", - " yield gensim.utils.simple_preprocess (line)\n", - "\n", - "def build_dataset(vocab, n_words):\n", - " \"\"\"Process the top n_words from raw inputs (vocab from read_data) into a dataset.\"\"\"\n", - " count = [['UNK', -1]] #stores when word is found --> UNK = unknown \n", - " count.extend(collections.Counter(vocab).most_common(n_words - 1))\n", - " dictionary = dict()\n", - " for word, _ in count:\n", - " dictionary[word] = len(dictionary)\n", - " token = list() \n", - " unk_count = 0\n", - " for word in vocab: #\n", - " if word in dictionary:\n", - " index = dictionary[word]\n", - " else:\n", - " index = 0 # dictionary['UNK'] assigned to 0 \n", - " unk_count += 1\n", - " token.append(index) #outputs a list of integers that represent words\n", - " count[0][1] = unk_count\n", - " reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys())) #allows for word lookup by integer\n", - " return token, count, dictionary, reversed_dictionary" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Number of unique words: 12117\n" - ] - } - ], - "source": [ - "top_words = 20000 #use number higher than expected unique words\n", - "\n", - "tweet_vocab = list(read_data(data['tweet']))\n", - "flat_tweet_vocab = [item for sublist in tweet_vocab for item in sublist]\n", - "token, count, dictionary, reversed_dictionary = build_dataset(flat_tweet_vocab, top_words)\n", - "\n", - "print(\"Number of unique words: {}\".format(len(count))) #correct num of unique words " - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "using ntlk to preprocess: ['govern', 'report', 'say', 'glob', 'warm', 'may', 'caus', 'cant', 'ment', 'il', 'cnsnews', 'com', 'link']\n", - "using gensim to preprocess: ['government', 'report', 'says', 'global', 'warming', 'may', 'cause', 'cancer', 'mental', 'illness', 'cnsnews', 'com', 'link']\n" - ] - } - ], - "source": [ - "#Compare preprocessing methods \n", - "\n", - "#using ntlk \n", - "print ('using 
ntlk to preprocess:', tokenized_corpus[15])\n", - "\n", - "#using gensim simple preprocesser \n", - "print ('using gensim to preprocess:', tweet_vocab[15])" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "# Create train and test sets\n", - "# Generate random indexes\n", - "\n", - "test_split = 0.8\n", - "train_size = int(len(X)*test_split)\n", - "test_size = len(X) - train_size\n", - "vector_size = 300\n", - "window_size = 10\n", - "max_tweet_length=512\n", - "\n", - "indexes = set(np.random.choice(len(tweet_vocab), train_size + test_size, replace=False))\n", - "\n", - "X_train = np.zeros((train_size, max_tweet_length, vector_size))\n", - "Y_train = np.zeros((train_size, 3), dtype=np.int32)\n", - "X_test = np.zeros((test_size, max_tweet_length, vector_size))\n", - "Y_test = np.zeros((test_size, 3), dtype=np.int32)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "for i, index in enumerate(indexes):\n", - " for t, token in enumerate(tweet_vocab[index]):\n", - " if t >= max_tweet_length:\n", - " break\n", - " \n", - " if token not in X_vecs:\n", - " continue\n", - " \n", - " if i < train_size:\n", - " X_train[i, t, :] = X_vecs[token]\n", - " else:\n", - " X_test[i - train_size, t, :] = X_vecs[token]\n", - " \n", - " \n", - " if i < train_size:\n", - " Y_train[i, :] = Y[index]\n", - " else:\n", - " Y_test[i - train_size, :] = Y[index]" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[ 0.18652344 -0.14941406 0.05883789 0.12011719 -0.0279541 -0.1328125\n", - " 0.08837891 -0.203125 0.20410156 0.140625 -0.11328125 -0.08105469\n", - " -0.11328125 -0.07910156 -0.07519531 0.15625 0.10693359 0.20996094\n", - " 0.18554688 -0.06982422 -0.11230469 -0.06933594 -0.05932617 -0.11621094\n", - " 0.05859375 -0.02294922 -0.03417969 0.12597656 0.09570312 0.13378906\n", - " -0.10009766 -0.1328125 -0.12255859 0.09375 -0.11035156 0.00282288\n", - " -0.16113281 -0.08691406 -0.13671875 0.11230469 0.21972656 -0.16503906\n", - " -0.04711914 -0.06835938 0.06835938 -0.07128906 -0.00334167 0.05371094\n", - " 0.10644531 0.03637695 0.07177734 -0.14453125 -0.05883789 -0.02539062\n", - " -0.01708984 0.04443359 0.03833008 -0.05957031 -0.04736328 0.0088501\n", - " -0.01098633 0.11035156 0.0135498 -0.18359375 0.11181641 0.02648926\n", - " -0.06933594 0.06176758 -0.13964844 -0.05273438 -0.04248047 -0.07519531\n", - " 0.0703125 0.15332031 -0.125 -0.01489258 -0.02099609 -0.15820312\n", - " -0.0255127 0.02783203 -0.01403809 -0.22070312 -0.03295898 0.13378906\n", - " -0.22558594 -0.08154297 -0.08544922 0.05078125 -0.0559082 -0.15429688\n", - " -0.02563477 -0.04248047 -0.11035156 -0.05493164 -0.16503906 -0.04882812\n", - " -0.20703125 0.02966309 -0.0177002 0.08398438 -0.12011719 -0.09960938\n", - " -0.07714844 0.08056641 -0.13476562 -0.03149414 -0.13671875 -0.0859375\n", - " 0.02111816 -0.10839844 0.20117188 0.06347656 -0.01574707 0.10009766\n", - " 0.14648438 0.04370117 -0.05541992 0.15429688 -0.00848389 0.08984375\n", - " -0.13183594 -0.00836182 -0.11328125 -0.07226562 -0.20117188 0.06591797\n", - " 0.05078125 -0.0859375 0.14257812 0.10253906 -0.11181641 0.02905273\n", - " -0.11425781 -0.16601562 -0.05786133 -0.0612793 -0.14355469 0.00325012\n", - " -0.07275391 -0.20703125 0.08349609 -0.19433594 0.09423828 0.25390625\n", - " -0.00793457 0.15917969 -0.09375 0.07958984 -0.26757812 0.03955078\n", - 
" 0.01208496 0.11962891 -0.10595703 0.15429688 -0.00952148 0.10644531\n", - " -0.19433594 -0.05102539 -0.13085938 -0.10644531 -0.078125 0.10058594\n", - " 0.16894531 0.03540039 -0.17773438 0.02087402 -0.08056641 -0.13476562\n", - " 0.02880859 0.09619141 -0.14550781 0.01287842 -0.20019531 0.078125\n", - " -0.06884766 -0.11962891 -0.23339844 -0.20898438 -0.01733398 0.01068115\n", - " -0.08447266 -0.23242188 -0.04614258 0.1328125 0.07080078 -0.04614258\n", - " -0.078125 0.03344727 0.10693359 0.02355957 0.09814453 -0.13964844\n", - " -0.11816406 0.04345703 -0.203125 -0.02258301 -0.05004883 0.23242188\n", - " -0.22851562 0.01306152 -0.00982666 0.13476562 -0.08789062 -0.06103516\n", - " 0.11083984 0.22363281 -0.09814453 -0.11132812 0.09179688 -0.05859375\n", - " 0.04467773 0.0300293 0.06982422 -0.0035553 -0.08398438 0.125\n", - " 0.0612793 0.06689453 -0.09326172 -0.11767578 0.00221252 0.13769531\n", - " 0.04760742 -0.10058594 0.24902344 0.04589844 0.17285156 0.00723267\n", - " 0.05541992 -0.06103516 -0.03588867 -0.0859375 0.12109375 0.02758789\n", - " -0.00683594 -0.06176758 -0.16796875 -0.02160645 0.07226562 0.07226562\n", - " 0.02307129 0.10888672 -0.02661133 -0.11474609 0.09082031 0.11572266\n", - " 0.16015625 0.11523438 -0.00744629 0.02832031 -0.01635742 -0.09765625\n", - " 0.14648438 0.11181641 0.17089844 0.00653076 0.07568359 0.01055908\n", - " -0.0625 -0.14550781 0.07324219 0.01416016 0.05639648 0.05102539\n", - " -0.06103516 0.10449219 0.078125 -0.02893066 0.00488281 0.0255127\n", - " 0.18066406 0.12597656 0.07910156 -0.11669922 0.04760742 -0.12255859\n", - " -0.23046875 0.02282715 0.11523438 -0.10498047 -0.09716797 0.12988281\n", - " 0.0201416 0.21582031 0.08740234 -0.01708984 -0.12695312 -0.06787109\n", - " -0.04296875 0.07910156 -0.15625 0.04150391 -0.01409912 0.07470703\n", - " -0.03759766 -0.12304688 -0.1171875 0.05957031 0.06884766 0.04516602]\n" - ] - } - ], - "source": [ - "print(X_test[1][2])" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "_________________________________________________________________\n", - "Layer (type) Output Shape Param # \n", - "=================================================================\n", - "conv1d_1 (Conv1D) (None, 512, 32) 28832 \n", - "_________________________________________________________________\n", - "conv1d_2 (Conv1D) (None, 512, 32) 3104 \n", - "_________________________________________________________________\n", - "max_pooling1d_1 (MaxPooling1 (None, 256, 32) 0 \n", - "_________________________________________________________________\n", - "flatten_1 (Flatten) (None, 8192) 0 \n", - "_________________________________________________________________\n", - "dense_1 (Dense) (None, 250) 2048250 \n", - "_________________________________________________________________\n", - "dense_2 (Dense) (None, 3) 753 \n", - "=================================================================\n", - "Total params: 2,080,939\n", - "Trainable params: 2,080,939\n", - "Non-trainable params: 0\n", - "_________________________________________________________________\n", - "None\n", - "Train on 4869 samples, validate on 1218 samples\n", - "Epoch 1/2\n", - "4869/4869 [==============================] - 52s 11ms/step - loss: 0.5755 - acc: 0.6901 - val_loss: 0.5582 - val_acc: 0.7094\n", - "Epoch 2/2\n", - "4869/4869 [==============================] - 41s 8ms/step - loss: 0.4659 - acc: 0.7754 - val_loss: 0.5275 - val_acc: 0.7356\n", - "Accuracy: 73.56%\n" - 
] - } - ], - "source": [ - "#Some variables \n", - "\n", - "top_words = 1000\n", - "max_words = 150\n", - "filters = 32 #filter = 1 x KERNEL \n", - "\n", - "# create the model \n", - "model = Sequential()\n", - "\n", - "model.add(Convolution1D(32, kernel_size=3, activation='elu', padding='same',\n", - " input_shape=(max_tweet_length, vector_size)))\n", - "\n", - "model.add(Convolution1D(filters=filters, kernel_size=3, padding='same', activation='relu'))\n", - "model.add(MaxPooling1D(pool_size=2))\n", - "model.add(Flatten())\n", - "model.add(Dense(250, activation='relu'))\n", - "model.add(Dense(3, activation='sigmoid')) \n", - "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) \n", - "print(model.summary())\n", - "\n", - "# Fit the model\n", - "model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=2, batch_size=128,\n", - " verbose=1)\n", - "\n", - "# Final evaluation of the model\n", - "scores = model.evaluate(X_test, Y_test, verbose=0)\n", - "print(\"Accuracy: %.2f%%\" % (scores[1]*100))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python [default]", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.2" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/Sarah_Google_Word2Vec.ipynb b/examples/Sarah_Google_Word2Vec.ipynb new file mode 100644 index 0000000..faa1fb9 --- /dev/null +++ b/examples/Sarah_Google_Word2Vec.ipynb @@ -0,0 +1,409 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import collections\n", + "import numpy as np\n", + "import pandas as pd\n", + "import gensim\n", + "from keras.models import Sequential\n", + "from keras.layers import Dense\n", + "from keras.layers.convolutional import Convolution1D\n", + "from keras.layers.convolutional import MaxPooling1D\n", + "from keras.layers import Flatten\n", + "from keras.utils import np_utils\n", + "\n", + "from sklearn.preprocessing import LabelEncoder" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Googles Word2Vec \n", + "\n", + "Shows how to use googles pretrained model as inputs to a CNN \n", + "\n", + "### First lets load in the pretrained model and do some data exploration" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Set random seed (for reproducibility)\n", + "np.random.seed(1000)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Set includes 3000000 words\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "//anaconda/envs/py36/lib/python3.6/site-packages/ipykernel_launcher.py:14: DeprecationWarning: Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).\n", + " \n" + ] + } + ], + "source": [ + "#Get word vectors using googles pretrained word2vec \n", + "#takes a minute \n", + "google = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin',binary=True)\n", + "\n", + "#includes some stop words (i.e. 
the, also, should, but not a, and, of)\n", + "#includes misspellings \n", + "#includes commony paired words (i.e. New_York)\n", + "\n", + "vocab = google.vocab.keys()\n", + "total_vocab = len(vocab)\n", + "print (\"Set includes\", total_vocab, \"words\")\n", + "\n", + "# Copy word vectors and delete Word2Vec model and original corpus to save memory\n", + "X_vecs = google.wv\n", + "#del google #wait to explore model first " + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "//anaconda/envs/py36/lib/python3.6/site-packages/ipykernel_launcher.py:2: DeprecationWarning: Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).\n", + " \n" + ] + }, + { + "data": { + "text/plain": [ + "[('global_warming', 0.889603853225708),\n", + " ('Climate_Change', 0.7147639393806458),\n", + " ('Climate', 0.6953692436218262),\n", + " ('Global_warming', 0.661054253578186),\n", + " ('climate', 0.6569506525993347),\n", + " ('greenhouse_gas_emissions', 0.6449477076530457),\n", + " ('greenhouse_gases', 0.6432511806488037),\n", + " ('carbon_emissions', 0.6395047307014465),\n", + " ('Global_Warming', 0.6281516551971436),\n", + " ('reducing_carbon_emissions', 0.6227284669876099)]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from gensim.models import KeyedVectors\n", + "google.wv.most_similar('climate_change')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "del google #save mem " + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "300" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Explore the vectors \n", + "X_vecs['hello'].size #check vectors " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Now see how using pretrained vectors improves the model" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Full dataset: 6090\n", + "dataset without NaN: 6087\n" + ] + } + ], + "source": [ + "# Load in data\n", + "# One hot encode sentiment \n", + "\n", + "data = pd.read_csv(\"../wyns/data/tweet_global_warming.csv\", encoding=\"latin\")\n", + "print(\"Full dataset: {}\".format(data.shape[0]))\n", + "data['existence'].fillna(value='ambiguous', inplace = True) #replace NA's in existence with \"ambiguous\"\n", + "data['existence'].replace(('Y', 'N'), ('Yes', 'No'), inplace=True) #rename so encoder doesnt get confused\n", + "data = data.dropna() #now drop NA values\n", + "print(\"dataset without NaN: {}\".format(data.shape[0]))\n", + "X = data.iloc[:,0]\n", + "Y = data.iloc[:,1]\n", + "\n", + "#one hot encoding = dummy vars from categorical var \n", + "#Create a one-hot encoded binary matrix \n", + "#N, Y, Ambig\n", + "#1, 0, 0 \n", + "#0, 1, 0\n", + "#0, 0, 1\n", + "\n", + "#encode class as integers \n", + "encoder = LabelEncoder()\n", + "encoder.fit(Y)\n", + "encoded_Y = encoder.transform(Y) \n", + "\n", + "#convert integers to one hot encoded\n", + "Y = np_utils.to_categorical(encoded_Y)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "def read_data(data_file):\n", + " for i, line in enumerate (data_file): \n", + " yield 
gensim.utils.simple_preprocess (line)\n", + "\n", + "def build_dataset(vocab, n_words):\n", + " \"\"\"Process the top n_words from raw inputs (vocab from read_data) into a dataset.\"\"\"\n", + " count = [['UNK', -1]] #stores when word is found --> UNK = unknown \n", + " count.extend(collections.Counter(vocab).most_common(n_words - 1))\n", + " dictionary = dict()\n", + " for word, _ in count:\n", + " dictionary[word] = len(dictionary)\n", + " token = list() \n", + " unk_count = 0\n", + " for word in vocab: #\n", + " if word in dictionary:\n", + " index = dictionary[word]\n", + " else:\n", + " index = 0 # dictionary['UNK'] assigned to 0 \n", + " unk_count += 1\n", + " token.append(index) #outputs a list of integers that represent words\n", + " count[0][1] = unk_count\n", + " reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys())) #allows for word lookup by integer\n", + " return token, count, dictionary, reversed_dictionary" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of unique words: 12117\n" + ] + } + ], + "source": [ + "top_words = 20000 #use number higher than expected unique words\n", + "\n", + "tweet_vocab = list(read_data(data['tweet']))\n", + "flat_tweet_vocab = [item for sublist in tweet_vocab for item in sublist]\n", + "token, count, dictionary, reversed_dictionary = build_dataset(flat_tweet_vocab, top_words)\n", + "\n", + "print(\"Number of unique words: {}\".format(len(count))) #correct num of unique words " + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# Create train and test sets\n", + "# Generate random indexes\n", + "\n", + "test_split = 0.8\n", + "train_size = int(len(X)*test_split)\n", + "test_size = len(X) - train_size\n", + "vector_size = 300\n", + "window_size = 10\n", + "max_tweet_length=512\n", + "\n", + "indexes = set(np.random.choice(len(tweet_vocab), train_size + test_size, replace=False))\n", + "\n", + "X_train = np.zeros((train_size, max_tweet_length, vector_size))\n", + "Y_train = np.zeros((train_size, 3), dtype=np.int32)\n", + "X_test = np.zeros((test_size, max_tweet_length, vector_size))\n", + "Y_test = np.zeros((test_size, 3), dtype=np.int32)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "for i, index in enumerate(indexes):\n", + " for t, token in enumerate(tweet_vocab[index]):\n", + " if t >= max_tweet_length:\n", + " break\n", + " \n", + " if token not in X_vecs:\n", + " continue\n", + " \n", + " if i < train_size:\n", + " X_train[i, t, :] = X_vecs[token]\n", + " else:\n", + " X_test[i - train_size, t, :] = X_vecs[token]\n", + " \n", + " \n", + " if i < train_size:\n", + " Y_train[i, :] = Y[index]\n", + " else:\n", + " Y_test[i - train_size, :] = Y[index]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Lets look at how our model performs now! 
" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "_________________________________________________________________\n", + "Layer (type) Output Shape Param # \n", + "=================================================================\n", + "conv1d_1 (Conv1D) (None, 512, 32) 28832 \n", + "_________________________________________________________________\n", + "conv1d_2 (Conv1D) (None, 512, 32) 3104 \n", + "_________________________________________________________________\n", + "max_pooling1d_1 (MaxPooling1 (None, 256, 32) 0 \n", + "_________________________________________________________________\n", + "flatten_1 (Flatten) (None, 8192) 0 \n", + "_________________________________________________________________\n", + "dense_1 (Dense) (None, 250) 2048250 \n", + "_________________________________________________________________\n", + "dense_2 (Dense) (None, 3) 753 \n", + "=================================================================\n", + "Total params: 2,080,939\n", + "Trainable params: 2,080,939\n", + "Non-trainable params: 0\n", + "_________________________________________________________________\n", + "None\n", + "Train on 4869 samples, validate on 1218 samples\n", + "Epoch 1/2\n", + "4869/4869 [==============================] - 41s 8ms/step - loss: 0.5755 - acc: 0.6904 - val_loss: 0.5578 - val_acc: 0.7124\n", + "Epoch 2/2\n", + "4869/4869 [==============================] - 41s 8ms/step - loss: 0.4663 - acc: 0.7755 - val_loss: 0.5273 - val_acc: 0.7332\n", + "Accuracy: 73.32%\n" + ] + } + ], + "source": [ + "#Some variables \n", + "\n", + "top_words = 1000\n", + "max_words = 150\n", + "filters = 32 #filter = 1 x KERNEL \n", + "\n", + "# create the model \n", + "model = Sequential()\n", + "\n", + "model.add(Convolution1D(32, kernel_size=3, activation='elu', padding='same',\n", + " input_shape=(max_tweet_length, vector_size)))\n", + "\n", + "model.add(Convolution1D(filters=filters, kernel_size=3, padding='same', activation='relu'))\n", + "model.add(MaxPooling1D(pool_size=2))\n", + "model.add(Flatten())\n", + "model.add(Dense(250, activation='relu'))\n", + "model.add(Dense(3, activation='sigmoid')) \n", + "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) \n", + "print(model.summary())\n", + "\n", + "# Fit the model\n", + "model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=2, batch_size=128,\n", + " verbose=1)\n", + "\n", + "# Final evaluation of the model\n", + "scores = model.evaluate(X_test, Y_test, verbose=0)\n", + "print(\"Accuracy: %.2f%%\" % (scores[1]*100))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/Sarah_Tweepy.ipynb b/examples/Sarah_Tweepy.ipynb new file mode 100644 index 0000000..b461407 --- /dev/null +++ b/examples/Sarah_Tweepy.ipynb @@ -0,0 +1,477 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Tweepy for getting tweets quickly \n", + "\n", + "Read the docs: 
http://tweepy.readthedocs.io/en/v3.5.0/\n", + "\n", + "(couldn't get python-twitter to run on my desktop) \n", + "\n", + "Code below taken from: https://www.karambelkar.info/2015/01/how-to-use-twitters-search-rest-api-most-effectively./ (great read)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import tweepy\n", + "\n", + "# consumer key, consumer secret, access token, access secret. Unique to each person. Read Wes' API notebook for more\n", + "API_KEY = 'IPbYoAbOUR1URWvXWeNwQNnZD'\n", + "API_SECRET = 'goN7XnztVpn6CgkEAAxU9GOVSwbUYwjuFC0ChXdxjWBhRrYZcj'\n", + "access_token = '506759494-rt09qdTZGlGH8WkBDd5M8Vgr6eGbZtlxQVaEH7hA'\n", + "access_token_secret = 'k6tPQuDCnqIf25Ethn6mtZ4pTAoncEufAIy8EVujP2JF2'\n", + "\n", + "auth = tweepy.AppAuthHandler(API_KEY, API_SECRET)\n", + "\n", + "api = tweepy.API(auth, wait_on_rate_limit=True,\n", + " wait_on_rate_limit_notify=True)\n", + "\n", + "if (not api):\n", + " print (\"Can't Authenticate\")\n", + " sys.exit(-1)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import sys\n", + "import jsonpickle\n", + "import os\n", + "import json\n", + "\n", + "searchQuery = 'climate change' # this is what we're searching for\n", + "maxTweets = 2000 # Some arbitrary large number (The while loop will run until Twitter times you out (~15min))\n", + "tweetsPerQry = 100 # max the API permits\n", + "fName = 'tweets.txt' # Stores tweets in text as well as a json file \n", + "\n", + "#below basically prevents pulling duplicate tweets (I think)\n", + "\n", + "# If results from a specific ID onwards are reqd, set since_id to that ID.\n", + "# else default to no lower limit, go as far back as API allows\n", + "sinceId = None\n", + "\n", + "# If results only below a specific ID are, set max_id to that ID.\n", + "# else default to no upper limit, start from the most recent tweet matching the search query.\n", + "max_id = -1" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading max 2000 tweets\n", + "Downloaded 100 tweets\n", + "Downloaded 200 tweets\n", + "Downloaded 300 tweets\n", + "Downloaded 400 tweets\n", + "Downloaded 500 tweets\n", + "Downloaded 600 tweets\n", + "Downloaded 700 tweets\n", + "Downloaded 800 tweets\n", + "Downloaded 900 tweets\n", + "Downloaded 1000 tweets\n", + "Downloaded 1087 tweets\n", + "Downloaded 1177 tweets\n", + "Downloaded 1277 tweets\n", + "Downloaded 1377 tweets\n", + "Downloaded 1474 tweets\n", + "Downloaded 1574 tweets\n", + "Downloaded 1674 tweets\n", + "Downloaded 1769 tweets\n", + "Downloaded 1862 tweets\n", + "Downloaded 1962 tweets\n", + "Downloaded 2062 tweets\n", + "Downloaded 2062 tweets, Saved to tweets.txt\n" + ] + } + ], + "source": [ + "tweetCount = 0\n", + "print(\"Downloading max {0} tweets\".format(maxTweets))\n", + "with open(fName, 'w') as f:\n", + " while tweetCount < maxTweets:\n", + " try:\n", + " if (max_id <= 0):\n", + " if (not sinceId):\n", + " new_tweets = api.search(q=searchQuery, count=tweetsPerQry)\n", + " else:\n", + " new_tweets = api.search(q=searchQuery, count=tweetsPerQry,\n", + " since_id=sinceId)\n", + " else:\n", + " if (not sinceId):\n", + " new_tweets = api.search(q=searchQuery, count=tweetsPerQry,\n", + " max_id=str(max_id - 1))\n", + " else:\n", + " new_tweets = api.search(q=searchQuery, count=tweetsPerQry,\n", 
+ " max_id=str(max_id - 1),\n", + " since_id=sinceId)\n", + " if not new_tweets:\n", + " print(\"No more tweets found\")\n", + " break\n", + " for tweet in new_tweets:\n", + " f.write(jsonpickle.encode(tweet._json, unpicklable=False) +\n", + " '\\n')\n", + " tweetCount += len(new_tweets)\n", + " print(\"Downloaded {0} tweets\".format(tweetCount))\n", + " max_id = new_tweets[-1].id\n", + " except tweepy.TweepError as e:\n", + " # Just exit if any error\n", + " print(\"some error : \" + str(e))\n", + " break\n", + "\n", + "print (\"Downloaded {0} tweets, Saved to {1}\".format(tweetCount, fName))" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "df = pd.read_json('tweets.txt', lines=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "18 hits have location data in 'place'\n" + ] + }, + { + "data": { + "text/plain": [ + "[{'attributes': {},\n", + " 'bounding_box': {'coordinates': [[[-88.1236583, 42.385769],\n", + " [-88.051803, 42.385769],\n", + " [-88.051803, 42.4480902],\n", + " [-88.1236583, 42.4480902]]],\n", + " 'type': 'Polygon'},\n", + " 'contained_within': [],\n", + " 'country': 'United States',\n", + " 'country_code': 'US',\n", + " 'full_name': 'Lake Villa, IL',\n", + " 'id': '83d2443c72c001a9',\n", + " 'name': 'Lake Villa',\n", + " 'place_type': 'city',\n", + " 'url': 'https://api.twitter.com/1.1/geo/id/83d2443c72c001a9.json'},\n", + " {'attributes': {},\n", + " 'bounding_box': {'coordinates': [[[-79.9327940509939, 40.59852214703781],\n", + " [-79.9327940509939, 40.59852214703781],\n", + " [-79.9327940509939, 40.59852214703781],\n", + " [-79.9327940509939, 40.59852214703781]]],\n", + " 'type': 'Polygon'},\n", + " 'contained_within': [],\n", + " 'country': 'United States',\n", + " 'country_code': 'US',\n", + " 'full_name': 'Allison Park, PA',\n", + " 'id': '07d9f81d4f885000',\n", + " 'name': 'Allison Park, PA',\n", + " 'place_type': 'poi',\n", + " 'url': 'https://api.twitter.com/1.1/geo/id/07d9f81d4f885000.json'},\n", + " {'attributes': {},\n", + " 'bounding_box': {'coordinates': [[[105.290653, 20.564095000000002],\n", + " [106.020197, 20.564095000000002],\n", + " [106.020197, 21.385226],\n", + " [105.290653, 21.385226]]],\n", + " 'type': 'Polygon'},\n", + " 'contained_within': [],\n", + " 'country': 'Vietnam',\n", + " 'country_code': 'VN',\n", + " 'full_name': 'Hanoi, Vietnam',\n", + " 'id': '0192af60292eace8',\n", + " 'name': 'Hanoi',\n", + " 'place_type': 'admin',\n", + " 'url': 'https://api.twitter.com/1.1/geo/id/0192af60292eace8.json'},\n", + " {'attributes': {},\n", + " 'bounding_box': {'coordinates': [[[105.290653, 20.564095000000002],\n", + " [106.020197, 20.564095000000002],\n", + " [106.020197, 21.385226],\n", + " [105.290653, 21.385226]]],\n", + " 'type': 'Polygon'},\n", + " 'contained_within': [],\n", + " 'country': 'Vietnam',\n", + " 'country_code': 'VN',\n", + " 'full_name': 'Hanoi, Vietnam',\n", + " 'id': '0192af60292eace8',\n", + " 'name': 'Hanoi',\n", + " 'place_type': 'admin',\n", + " 'url': 'https://api.twitter.com/1.1/geo/id/0192af60292eace8.json'},\n", + " {'attributes': {},\n", + " 'bounding_box': {'coordinates': [[[-124.482003, 32.528832],\n", + " [-114.131212, 32.528832],\n", + " [-114.131212, 42.009519],\n", 
+ " [-124.482003, 42.009519]]],\n", + " 'type': 'Polygon'},\n", + " 'contained_within': [],\n", + " 'country': 'United States',\n", + " 'country_code': 'US',\n", + " 'full_name': 'California, USA',\n", + " 'id': 'fbd6d2f5a4e4a15e',\n", + " 'name': 'California',\n", + " 'place_type': 'admin',\n", + " 'url': 'https://api.twitter.com/1.1/geo/id/fbd6d2f5a4e4a15e.json'},\n", + " {'attributes': {},\n", + " 'bounding_box': {'coordinates': [[[3.9825499, 50.8993057],\n", + " [4.164221, 50.8993057],\n", + " [4.164221, 50.9948827],\n", + " [3.9825499, 50.9948827]]],\n", + " 'type': 'Polygon'},\n", + " 'contained_within': [],\n", + " 'country': 'Belgium',\n", + " 'country_code': 'BE',\n", + " 'full_name': 'Aalst, Belgiƫ',\n", + " 'id': 'b0500318040c21d2',\n", + " 'name': 'Aalst',\n", + " 'place_type': 'city',\n", + " 'url': 'https://api.twitter.com/1.1/geo/id/b0500318040c21d2.json'},\n", + " {'attributes': {},\n", + " 'bounding_box': {'coordinates': [[[-85.644548, 38.072047],\n", + " [-85.520944, 38.072047],\n", + " [-85.520944, 38.143229],\n", + " [-85.644548, 38.143229]]],\n", + " 'type': 'Polygon'},\n", + " 'contained_within': [],\n", + " 'country': 'United States',\n", + " 'country_code': 'US',\n", + " 'full_name': 'Heritage Creek, KY',\n", + " 'id': '0157e4d7264811f5',\n", + " 'name': 'Heritage Creek',\n", + " 'place_type': 'city',\n", + " 'url': 'https://api.twitter.com/1.1/geo/id/0157e4d7264811f5.json'},\n", + " {'attributes': {},\n", + " 'bounding_box': {'coordinates': [[[-77.119401, 38.801826],\n", + " [-76.909396, 38.801826],\n", + " [-76.909396, 38.9953797],\n", + " [-77.119401, 38.9953797]]],\n", + " 'type': 'Polygon'},\n", + " 'contained_within': [],\n", + " 'country': 'United States',\n", + " 'country_code': 'US',\n", + " 'full_name': 'Washington, DC',\n", + " 'id': '01fbe706f872cb32',\n", + " 'name': 'Washington',\n", + " 'place_type': 'city',\n", + " 'url': 'https://api.twitter.com/1.1/geo/id/01fbe706f872cb32.json'},\n", + " {'attributes': {},\n", + " 'bounding_box': {'coordinates': [[[-81.7297293, 28.819687],\n", + " [-81.6317899, 28.819687],\n", + " [-81.6317899, 28.8919998],\n", + " [-81.7297293, 28.8919998]]],\n", + " 'type': 'Polygon'},\n", + " 'contained_within': [],\n", + " 'country': 'United States',\n", + " 'country_code': 'US',\n", + " 'full_name': 'Eustis, FL',\n", + " 'id': 'e1ad088723f72290',\n", + " 'name': 'Eustis',\n", + " 'place_type': 'city',\n", + " 'url': 'https://api.twitter.com/1.1/geo/id/e1ad088723f72290.json'},\n", + " {'attributes': {},\n", + " 'bounding_box': {'coordinates': [[[-79.429128, 43.798004],\n", + " [-79.170217, 43.798004],\n", + " [-79.170217, 43.963385],\n", + " [-79.429128, 43.963385]]],\n", + " 'type': 'Polygon'},\n", + " 'contained_within': [],\n", + " 'country': 'Canada',\n", + " 'country_code': 'CA',\n", + " 'full_name': 'Markham, Ontario',\n", + " 'id': '7d2673c3623fe492',\n", + " 'name': 'Markham',\n", + " 'place_type': 'city',\n", + " 'url': 'https://api.twitter.com/1.1/geo/id/7d2673c3623fe492.json'},\n", + " {'attributes': {},\n", + " 'bounding_box': {'coordinates': [[[29.0788593, -25.9971906],\n", + " [29.3804417, -25.9971906],\n", + " [29.3804417, -25.7928753],\n", + " [29.0788593, -25.7928753]]],\n", + " 'type': 'Polygon'},\n", + " 'contained_within': [],\n", + " 'country': 'South Africa',\n", + " 'country_code': 'ZA',\n", + " 'full_name': 'eMalahleni, South Africa',\n", + " 'id': '27b2c17b988dcc52',\n", + " 'name': 'eMalahleni',\n", + " 'place_type': 'city',\n", + " 'url': 
'https://api.twitter.com/1.1/geo/id/27b2c17b988dcc52.json'},\n", + " {'attributes': {},\n", + " 'bounding_box': {'coordinates': [[[-77.053632691935, 38.922719893926285],\n", + " [-77.053632691935, 38.922719893926285],\n", + " [-77.053632691935, 38.922719893926285],\n", + " [-77.053632691935, 38.922719893926285]]],\n", + " 'type': 'Polygon'},\n", + " 'contained_within': [],\n", + " 'country': 'United States',\n", + " 'country_code': 'US',\n", + " 'full_name': 'Omni Shoreham Hotel',\n", + " 'id': '07d9db1114c80001',\n", + " 'name': 'Omni Shoreham Hotel',\n", + " 'place_type': 'poi',\n", + " 'url': 'https://api.twitter.com/1.1/geo/id/07d9db1114c80001.json'},\n", + " {'attributes': {},\n", + " 'bounding_box': {'coordinates': [[[-90.4181075, 41.696088],\n", + " [-82.122971, 41.696088],\n", + " [-82.122971, 48.306272],\n", + " [-90.4181075, 48.306272]]],\n", + " 'type': 'Polygon'},\n", + " 'contained_within': [],\n", + " 'country': 'United States',\n", + " 'country_code': 'US',\n", + " 'full_name': 'Michigan, USA',\n", + " 'id': '67d92742f1ebf307',\n", + " 'name': 'Michigan',\n", + " 'place_type': 'admin',\n", + " 'url': 'https://api.twitter.com/1.1/geo/id/67d92742f1ebf307.json'},\n", + " {'attributes': {},\n", + " 'bounding_box': {'coordinates': [[[103.749959507073, 1.2123138339349],\n", + " [103.918426999964, 1.2123138339349],\n", + " [103.918426999964, 1.3687449990256901],\n", + " [103.749959507073, 1.3687449990256901]]],\n", + " 'type': 'Polygon'},\n", + " 'contained_within': [],\n", + " 'country': 'Singapore',\n", + " 'country_code': 'SG',\n", + " 'full_name': 'Central Region, Singapore',\n", + " 'id': '58a4c3a0d54e1400',\n", + " 'name': 'Central Region',\n", + " 'place_type': 'admin',\n", + " 'url': 'https://api.twitter.com/1.1/geo/id/58a4c3a0d54e1400.json'},\n", + " {'attributes': {},\n", + " 'bounding_box': {'coordinates': [[[-80.20811, 26.080935],\n", + " [-80.0902351, 26.080935],\n", + " [-80.0902351, 26.219801],\n", + " [-80.20811, 26.219801]]],\n", + " 'type': 'Polygon'},\n", + " 'contained_within': [],\n", + " 'country': 'United States',\n", + " 'country_code': 'US',\n", + " 'full_name': 'Fort Lauderdale, FL',\n", + " 'id': '6c686af766d8429c',\n", + " 'name': 'Fort Lauderdale',\n", + " 'place_type': 'city',\n", + " 'url': 'https://api.twitter.com/1.1/geo/id/6c686af766d8429c.json'},\n", + " {'attributes': {},\n", + " 'bounding_box': {'coordinates': [[[-87.095676, 34.507116],\n", + " [-86.925426, 34.507116],\n", + " [-86.925426, 34.654734],\n", + " [-87.095676, 34.654734]]],\n", + " 'type': 'Polygon'},\n", + " 'contained_within': [],\n", + " 'country': 'United States',\n", + " 'country_code': 'US',\n", + " 'full_name': 'Decatur, AL',\n", + " 'id': '246fb652d518385d',\n", + " 'name': 'Decatur',\n", + " 'place_type': 'city',\n", + " 'url': 'https://api.twitter.com/1.1/geo/id/246fb652d518385d.json'},\n", + " {'attributes': {},\n", + " 'bounding_box': {'coordinates': [[[-119.5937583, 49.773384],\n", + " [-119.319893, 49.773384],\n", + " [-119.319893, 50.024913],\n", + " [-119.5937583, 50.024913]]],\n", + " 'type': 'Polygon'},\n", + " 'contained_within': [],\n", + " 'country': 'Canada',\n", + " 'country_code': 'CA',\n", + " 'full_name': 'Kelowna, British Columbia',\n", + " 'id': '484de3636fa22d62',\n", + " 'name': 'Kelowna',\n", + " 'place_type': 'city',\n", + " 'url': 'https://api.twitter.com/1.1/geo/id/484de3636fa22d62.json'},\n", + " {'attributes': {},\n", + " 'bounding_box': {'coordinates': [[[-109.282237, 41.5595061],\n", + " [-109.192506, 41.5595061],\n", + " [-109.192506, 
41.633358],\n", + " [-109.282237, 41.633358]]],\n", + " 'type': 'Polygon'},\n", + " 'contained_within': [],\n", + " 'country': 'United States',\n", + " 'country_code': 'US',\n", + " 'full_name': 'Rock Springs, WY',\n", + " 'id': '5db2aef731e97df0',\n", + " 'name': 'Rock Springs',\n", + " 'place_type': 'city',\n", + " 'url': 'https://api.twitter.com/1.1/geo/id/5db2aef731e97df0.json'}]" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "place = df[\"place\"] # Simple preprocess \n", + "place = place.dropna()\n", + "print (len(list(place)), \"hits have location data in 'place'\")\n", + "list(place)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/wyns/get_tweets.py b/wyns/get_tweets.py new file mode 100644 index 0000000..5a66fd8 --- /dev/null +++ b/wyns/get_tweets.py @@ -0,0 +1,69 @@ +import json +import jsonpickle +import os +import sys +import tweepy + +#Use to get tweets in a way that bypasses twitters weird rules# +#Should be able to run on a build node on hyak - havent tested + +# consumer key, consumer secret, access token, access secret. Unique to each person. Read Wes' API notebook for more +API_KEY = 'IPbYoAbOUR1URWvXWeNwQNnZD' +API_SECRET = 'goN7XnztVpn6CgkEAAxU9GOVSwbUYwjuFC0ChXdxjWBhRrYZcj' +access_token = '506759494-rt09qdTZGlGH8WkBDd5M8Vgr6eGbZtlxQVaEH7hA' +access_token_secret = 'k6tPQuDCnqIf25Ethn6mtZ4pTAoncEufAIy8EVujP2JF2' + +auth = tweepy.AppAuthHandler(API_KEY, API_SECRET) +api = tweepy.API(auth, wait_on_rate_limit=True, + wait_on_rate_limit_notify=True) + +if (not api): + print ("Can't Authenticate") + sys.exit(-1) + +searchQuery = 'climate change' +maxTweets = 2000 # Some arbitrary large number (Will run until twitter API times you out) +tweetsPerQry = 100 # max the API permits per query +fName = 'tweets.txt' # Stores tweets in text as well as a json file + +#below basically prevents pulling duplicate tweets (I think) +sinceId = None +max_id = -1 + +tweetCount = 0 +print("Downloading max {0} tweets".format(maxTweets)) + +with open(fName, 'w') as f: + while tweetCount < maxTweets: + try: + if (max_id <= 0): + if (not sinceId): + new_tweets = api.search(q=searchQuery, count=tweetsPerQry) + else: + new_tweets = api.search(q=searchQuery, count=tweetsPerQry, + since_id=sinceId) + else: + if (not sinceId): + new_tweets = api.search(q=searchQuery, count=tweetsPerQry, + max_id=str(max_id - 1)) + else: + new_tweets = api.search(q=searchQuery, count=tweetsPerQry, + max_id=str(max_id - 1), + since_id=sinceId) + if not new_tweets: + print("No more tweets found") + break + for tweet in new_tweets: + f.write(jsonpickle.encode(tweet._json, unpicklable=False) + + '\n') + tweetCount += len(new_tweets) + print("Downloaded {0} tweets".format(tweetCount)) + max_id = new_tweets[-1].id + except tweepy.TweepError as e: + # Just exit if any error + print("some error : " + str(e)) + break + +print ("Downloaded {0} tweets, Saved to {1}".format(tweetCount, fName)) + +