<a href="https://colab.research.google.com/github/pepborrell/LauzHack2019/blob/master/PredictComment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import string

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras
import tensorflow.keras.preprocessing.image as image
from tensorflow.keras.applications import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.models import Model
#from tensorflow.keras.applications.inception_v3 import preprocess_input, InceptionV3

# Predicting the comment
From an image, we want to predict a comment.

## Loading the data

In [8]:
# Location of the CSV that lists the posts; it is read straight from the web.
url = 'https://pastebin.com/raw/SjMqw5Wx'
df = pd.read_csv(url)

# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,id,postid,url,upvotes
0,0,dpwday,https://i.redd.it/zhh7urcrtyv31.png,39
1,1,dpwdag,https://i.redd.it/lbanftpptyv31.jpg,1
2,2,dpwda9,https://i.redd.it/jpfds36rtyv31.jpg,21
3,3,dpwd9n,https://i.redd.it/hrcbzuvjtyv31.png,30
4,4,dpwd8h,https://i.redd.it/rb8pod5otyv31.png,31


## Extracting information from image
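We use a VGG16 model pretrained on ImageNet. We drop its final softmax (classification) layer and reuse all the other layers, so the output of the last remaining layer serves as a feature vector describing the image.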

In [89]:
# Full ImageNet classifier, including the fully connected "top" layers.
vgg_classifier = VGG16(include_top=True,weights='imagenet')

vgg_classifier.summary()

# Build the feature extractor: same input, but stop at the layer just before
# the final softmax. `layers.pop()` is deliberately not used: in tf.keras,
# `Model.layers` returns a new list on every access, so popping from it leaves
# the model untouched and `layers[-1]` would still be the softmax output.
modelvgg = Model(inputs=vgg_classifier.inputs, outputs=vgg_classifier.layers[-2].output)

Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg16_weights_tf_dim_ordering_tf_kernels.h5
Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         [(None, 224, 224, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     

## Extracting data from each image

In [91]:
# `import urllib` alone does not guarantee that the `request` submodule is loaded.
import urllib.request
from io import BytesIO
import requests

npix = 224
target_size = (npix,npix,3)

# Feature vector produced by `modelvgg` for each of the first 100 posts,
# keyed by the post's position in that slice.
images = {}
for idx, imageUrl in enumerate(df['url'][:100]):
    print(idx)
    # Download the image of the current row. The response is bound to its own
    # name so the dataset URL stored in `url` by the loading cell is not overwritten.
    with urllib.request.urlopen(imageUrl) as response:
        # load_img takes (height, width); the channel count is not part of it.
        img = image.load_img(BytesIO(response.read()), target_size=target_size[:2])
    # Convert PIL image to numpy array of 3-dimensions
    x = image.img_to_array(img)
    nimage = preprocess_input(x)

    # predict() works on batches, so add a leading batch axis of size 1.
    y_pred = modelvgg.predict(np.expand_dims(nimage, axis=0))
    images[idx] = y_pred.flatten()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


## Text cleaning and preprocessing


In [0]:
def cleanWords(text):
    """Normalise a sequence of word tokens.

    Each token is lower-cased and stripped of ASCII punctuation
    (`string.punctuation`). A token is kept only if, after that, it is longer
    than one character and consists solely of alphabetic characters.

    Requires the standard-library `string` module to be imported at notebook
    level.

    Args:
        text: iterable of string tokens (e.g. the result of splitting a comment).

    Returns:
        A new list with the cleaned tokens that passed the filters, in their
        original order.
    """
    # Translation table that deletes every punctuation character.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    cleaned = []
    for token in text:
        token = token.lower().translate(strip_punctuation)
        # len > 1 drops hanging 's' and 'a'; isalpha() drops tokens with digits.
        if len(token) > 1 and token.isalpha():
            cleaned.append(token)
    return cleaned

def cleanComment(text):
    """Turn a raw comment string into a list of cleaned word tokens.

    The comment is split on whitespace and the resulting tokens are passed
    through `cleanWords`.

    Args:
        text: the raw comment as a single string.

    Returns:
        The list of tokens returned by `cleanWords`.
    """
    # The split result must be kept: passing the raw string on would make
    # cleanWords iterate over single characters, which it all filters out.
    words = text.split()
    return cleanWords(words)

def addStartEndSeq(words):
    """Return a new list holding `words` framed by the sequence markers.

    The result starts with 'startseq' and ends with 'endseq'; `words` itself
    is left unmodified.
    """
    start_marker = ['startseq']
    end_marker = ['endseq']
    return start_marker + words + end_marker

In [109]:
# NOTE(review): this cell assumes `df` has a 'comments' column of raw comment
# strings; the `df.head()` preview earlier in the notebook does not show one.
# Confirm the CSV provides it before running.

# Clean every comment and wrap it in start/end markers. The result is kept in
# its own Series (the original calls discarded it) and `df` is not modified,
# so the cell can be re-run safely.
comment_tokens = df['comments'].apply(cleanComment).apply(addStartEndSeq)

# Flatten the per-comment token lists into one list of words.
all_text = []
for tokens in comment_tokens:
    all_text.extend(tokens)

# Count how often each word occurs; .get() supplies 0 for unseen words.
word_freq = {}
for word in all_text:
    word_freq[word] = word_freq.get(word, 0) + 1

KeyError: ignored