In [31]:
import cnfg
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm.exc import NoResultFound
from instagram.client import InstagramAPI 
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
from db_setup import Base, User, Media, Follower
from models import get_user_id, commit_to_db, session

In [6]:
config = cnfg.load(".instagram_config")
api = InstagramAPI(client_id=config['CLIENT_ID'],client_secret=config['CLIENT_SECRET'])

In [7]:
key_id = get_user_id('jstnstwrt')

In [9]:
list_of_followers = session.query(Follower).filter_by(user_id=key_id)

In [12]:
def store_user_location(user_id):
    try:
        newUser = session.query(User).filter_by(user_id=user_id).one()
        media =  session.query(Media).filter_by(user_id=user_id).filter(Media.latitude != None).limit(1)
        try:
            newUser.latitude = str(media[0].latitude)
            newUser.longitude = str(media[0].longitude)
            commit_to_db(newUser)
        except IndexError:
            print 'User has no geo tags'
    except NoResultFound:
        print 'User is private'

In [8]:
sql_query = 'select user.id,user.user_id,bio,num_followers,num_following,num_posts,latitude,longitude from user join follower on user.user_id = follower.follower_id where follower.user_id=332324252'

In [9]:
df = pd.read_sql_query(sql_query,con='sqlite:///instagram.sqlite')

In [10]:
df.shape

(491, 8)

In [7]:
df.bio[:5]

0    Earth: you don't have to be crazy to live here...
1                                                     
2    Hand-crafted wedding dresses born from happine...
3                                                   Oi
4    Freelance Fashion Portrait Photographer in Lon...
Name: bio, dtype: object

In [1]:
import re
import nltk
from nltk.corpus import stopwords
from string import punctuation, digits, ascii_lowercase

In [11]:
escapes = ''.join([chr(char) for char in range(1, 32)])
removeables = escapes + digits 
stops = [str(word) for word in stopwords.words('english')] + list(ascii_lowercase)
sno = nltk.stem.SnowballStemmer('english')

In [25]:
def parse_text(text, stem=False):
    ''' This function takes a review string and removes all escape sequences,
        digits, punctuation, http links, and stop words. Furthermore, every
        word in the string will be stemmed using nltk's snowball stemmer.
        Every word is also transformed to be lowercase.'''
    
    text = re.sub(r"http\S+", " ", text)
    regex = re.compile('[%s]' % re.escape(punctuation))
    text = regex.sub(' ', text)
    text = text.translate({ord(c): None for c in removeables})
#     text = text.decode('utf8')
    if stem == True:
        text = ' '.join([sno.stem(word.lower()) for word in text.split() if word.lower() not in set(stops)])
    else:
        text = ' '.join([word.lower() for word in text.split() if word.lower() not in set(stops)])
    return text

In [26]:
df['bio_parsed']=df.bio.apply(parse_text)

In [28]:
df.bio_parsed[:10]

0              earth crazy live helps columbia records
1                                                     
2    hand crafted wedding dresses born happiness la...
3                                                   oi
4    freelance fashion portrait photographer london...
5    producer engineer credits joell ortiz troy ave...
6    sharing rawrrrchell love passion food follow l...
7                                                     
8    friend businessman investor living nyc founder...
9                                                     
Name: bio_parsed, dtype: object

In [29]:
corpus = df.bio_parsed.tolist()

In [34]:
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(corpus)
words = count_vect.get_feature_names()
counts = X_counts.toarray()

In [36]:
bios = pd.DataFrame(counts)
bios.columns = words

In [37]:
bios.shape

(491, 1559)