# Gateway to Research

This notebook loads and shows the Gateway to Research data

Check this [repo](https://github.com/nestauk/gtr_data_processing) for additional information about the GtR data.

# Preamble

In [None]:
%run notebook_preamble.ipy

In [None]:
# Functions etc here
import re
from pylab import *
from plotnine import * 
import geopandas as gpd
from string import punctuation
from pyproj import Proj

def flatten_list(a_list):
    """Flatten one level of nesting.

    Args:
        a_list: An iterable whose items are themselves iterable.

    Returns:
        A new list containing every inner item, in the original order.
    """
    flat = []
    for inner in a_list:
        flat.extend(inner)
    return flat

# Analysis of pre-processed data

In [None]:
# Load the GtR extract that has been processed for university effects.
# The file is read as a zip archive, and the literal text '[]' is treated as missing.
my_path = 'filepath/060819_gtr_creative_sect.csv'

gtr = pd.read_csv(my_path, compression='zip', na_values='[]')

# Keep every column except the first one.
gtr = gtr.iloc[:, 1:]


In [None]:
# Column labels of the loaded frame (iterating a DataFrame yields its column names).
list(gtr)

In [None]:
# Preview the first five rows.
gtr.head(n=5)

In [None]:
# Frequency of each distinct value in the creative_sector column.
gtr['creative_sector'].value_counts()

# Create a flag for any creative sector, plus an individual flag for each component sector, i.e.

 a=['Museums, galleries and libraries', 'Film, TV, video, radio and photography', 'Design', 'Architecture', 'Publishing', 'Advertising and marketing', 'Crafts', 'IT, software and computer services', 'Music, performing and visual arts']

In [None]:
# Creative-industry sector labels searched for in gtr['creative_sector'].
# One label per line, each followed by a comma: adjacent string literals are
# silently concatenated by Python, which previously fused the film and design
# labels into a single label that never matched either sector.
creative_industry = [
    'Museums, galleries and libraries',
    'Film, TV, video, radio and photography',
    'Design',
    'Architecture',
    'Publishing',
    'Advertising and marketing',
    'Crafts',
    'IT, software and computer services',
    'Music, performing and visual arts',
]


#General creative function
def creativesearch(x):
    """Return 1 if ``x`` mentions any label in ``creative_industry``, otherwise 0.

    All labels are OR-ed into one case-insensitive regular expression.
    An integer flag is returned (rather than None) so the result can be
    used to subset the dataset with comparison operators.
    """
    pattern = "|".join(creative_industry)
    matched = re.search(pattern, x, re.IGNORECASE)
    return 1 if matched else 0

#Domain function
def domain(x, y):
    """Return 1 if pattern ``y`` occurs in the text ``x`` (ignoring case), otherwise 0.

    Args:
        x: The text to search (a cell value from the column being flagged).
        y: The word or regular expression to look for.

    An integer flag is returned (rather than None) so the result can be
    used to subset the dataset with comparison operators.
    """
    found = re.search(y, x, re.IGNORECASE)
    return 1 if found else 0


#Set as string
gtr['creative_sector'] = gtr['creative_sector'].astype(str)


#Apply functions
# Series.map applies the function to each value; it replaces DataFrame.applymap,
# which is deprecated in recent pandas releases, and yields the same flags.
gtr['creative_flag'] = gtr['creative_sector'].map(creativesearch)

#Creates sector flags for each category
# Each sector gets its own 0/1 column named after the sector label.
for elem in creative_industry:
    gtr[elem] = gtr['creative_sector'].map(lambda x, label=elem: domain(x, label))

gtr.head(n=6)

# Count of AI projects by creative sector

In [None]:
#Sums the sector flags by AI status
# Select the flag columns before aggregating: summing the whole frame would also
# try to add up the free-text columns, which is slow at best and an error at worst.
flag_columns = creative_industry + ['creative_flag']
countby_ai_status = gtr.groupby(['ai_mod'])[flag_columns].sum()

#Pastes to clipboard
countby_ai_status.to_clipboard()

In [None]:
# Bar chart of the per-sector totals taken from the ai_mod == True row.
ax=countby_ai_status.loc[True , : ].plot.bar(figsize=(10,5))
ax.set_ylabel('Number of AI related projects')

In [None]:
#view_the_abstracts=gtr['abstract'][(gtr['ai_mod']==True) &  (gtr['creative_flag']==1)]


# Looks at how the number of projects is changing over time

AI projects

In [None]:


# Frequency polygon of projects per year (bin width 1) for rows with ai_mod == True.
# Axes are fixed to 2007-2018 and 0-300.
(ggplot(gtr[gtr['ai_mod']==True],aes(x='year',group='ai_mod',color='ai_mod'))+
  geom_freqpoly(binwidth = 1, show_legend=False) +xlab("Year")+ylab("Number of AI projects")+xlim(2007,2018)+ylim(0,300))

Creative projects

In [None]:
# Frequency polygon of projects per year (bin width 1) for rows flagged as creative.
# Axes are fixed to 2007-2018 and 0-400.
#Was creative_flag_semantic
(ggplot(gtr[gtr['creative_flag']==True],aes(x='year',group='creative_flag',color='creative_flag'))+
  geom_freqpoly(binwidth = 1, show_legend=False) +xlab("Year")+ylab("Number of Creative projects")+xlim(2007,2018)+ylim(0,400))

AI and Creative projects

In [None]:
# Frequency polygon of projects per year (bin width 1) for rows that are both
# flagged as creative and marked ai_mod. Axes are fixed to 2007-2018 and 0-100.
(ggplot(gtr[(gtr['creative_flag']==True) & (gtr['ai_mod']==1)],aes(x='year'))+
  geom_freqpoly(binwidth = 1, show_legend=False) +xlab("Year")+ylab("Number of AI and Creative projects")+xlim(2007,2018)+ylim(0,100))

# Spatial analysis

In [None]:
#Loads data

# NOTE(review): stem and files are joined below by plain string concatenation,
# so stem must end with a path separator for the resulting path to be valid.
stem="filepath"

files="Local_Authority_Districts_December_2017_Super_Generalised_Clipped_Boundaries_in_United_Kingdom_WGS84.shp"

#proje="+proj=utm +zone=33 +ellps=WGS84 +datum=WGS84 +units=m +no_defs"

# Read the local authority district boundaries with geopandas.
UK_lad=gpd.read_file(stem+files)

# Show the coordinate reference system recorded for the boundaries.
UK_lad.crs

#Sets the projection

#UK_lad = UK_lad.to_crs({'init' :'epsg:25832'})


Note: Issue in the projection to resolve

In [None]:
#Check it's loaded
# Plain outline map of the boundaries, with the title blanked and the axes hidden.
ax=UK_lad.plot( figsize=(5, 5))
ax.set_title('')
ax.axis('off')

Spatial counts by local authority

In [None]:
#Sorts out the multiple local authorities

# Subset to projects that are both AI and creative. An explicit copy is taken
# because the all_lad_code column is reassigned further down this cell; without
# it pandas would be writing to a slice of gtr (SettingWithCopyWarning).
creative_ai = gtr[(gtr['ai_mod'] == True) & (gtr['creative_flag'] == 1)].copy()

creative_ai.shape



def strip_punctuation(s):
    """Return ``s`` with every character found in ``string.punctuation`` removed."""
    kept = []
    for ch in s:
        if ch not in punctuation:
            kept.append(ch)
    return ''.join(kept)

# Turn each project's local authority entry into text and strip the list
# punctuation, leaving space-separated codes.
creative_ai['all_lad_code'] = creative_ai['all_lad_code'].astype(str).map(strip_punctuation)


#Convert the column of code strings into one single flat list

# Join every row with a space and split on whitespace. This gives the same list
# as growing a string inside a loop, without the quadratic re-copying.
a = ' '.join(creative_ai['all_lad_code']).split()


Table of how often each local authority appears in the list

In [None]:
from collections import Counter

# Tally how many times each local authority code occurs in the pooled list.
d = Counter(a)

# One row per code: the Counter keys become the 'la_code' column and the
# tallies become the 'project count' column.
ai_creative_count = (
    pd.DataFrame.from_dict(d, orient='index')
    .reset_index()
    .rename(columns={'index': 'la_code', 0: 'project count'})
)

ai_creative_count.head(n=5)

In [None]:
#Merges the two datasets
# Drop any columns left over from a previous run of this cell first. Otherwise a
# re-run would merge 'project count' twice, pandas would rename the copies with
# _x/_y suffixes, and later cells that read 'project count' would fail.
UK_lad = (
    UK_lad
    .drop(columns=['la_code', 'project count'], errors='ignore')
    .merge(ai_creative_count, how='left', left_on='lad17cd', right_on='la_code')
)

UK_lad.tail(n=5)


# Local Authority map for all participating organisations

In [None]:
# Districts with no matching count are NaN after the left merge; treat them as zero.
UK_lad['project count']=UK_lad['project count'].fillna(0)


# Map shaded by the count per district, with the title blanked and the axes hidden.
ax=UK_lad.plot(column='project count', cmap='cool', figsize=(15,15))
ax.set_title('')
ax.axis('off')

# Table of local authority counts for all participating organisations

In [None]:
# Ranked table of districts: highest count first, counts as integers,
# reader-friendly column names and a fresh 0..n-1 index.
tabs = (
    UK_lad[['lad17nm', 'project count']]
    .sort_values(by='project count', ascending=False)
    .astype({'project count': int})
    .rename(columns={'lad17nm': 'local authority', 'project count': 'project partner count'})
    .reset_index(drop=True)
)

tabs.head(n=12)

In [None]:
# Copy the ranked table to the system clipboard for pasting elsewhere.
tabs.to_clipboard()

# Topic analysis of the data at the intersection of AI and creative

In [None]:
#import sklearn
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn import metrics  #for the cluster metrics like silhoute score
from sklearn import manifold #for TSNE
import numpy as np
import re
from string import punctuation
from time import time



In [None]:
#Select the data, admittedly a small sample

# Explicit copy: the text-cleaning steps assign back into df, which would otherwise
# be a slice of gtr and trigger pandas' SettingWithCopyWarning.
df = gtr[(gtr['ai_mod'] == True) & (gtr['creative_flag'] == 1)].copy()

column_names = ['abstract']

df[column_names].shape



Text cleaning

In [None]:


#Sets to lower case
df[column_names] = df[column_names].applymap(lambda x: x.lower())


#Removes the utf characters

def utfremove(x):
    """Delete each letter u that is immediately followed by a single or double quote.

    Both the u and the quote are removed.
    NOTE(review): this also hits ordinary words, e.g. "you're" becomes "yore".
    """
    # The backslash only escapes the double quote inside the pattern string.
    u_quote = re.compile(r"u'|u\"")
    return u_quote.sub("", x)

df[column_names] = df[column_names].applymap(utfremove)

#Removes new line characters
def nlremove(x):
    """Delete every literal backslash-n pair (two characters) from ``x``.

    Real newline characters are left untouched: the pattern matches an escaped
    backslash followed by the letter n.
    """
    escaped_newline = re.compile(r"\\n")
    return escaped_newline.sub("", x)


#Removes hyperlinks

def htmlremove(x):
    """Delete every run of non-whitespace characters that starts with ``http``."""
    link = re.compile(r"http\S+")
    return link.sub("", x)

df[column_names] = df[column_names].applymap(htmlremove)


#Removes punctuation

def strip_punctuation(s):
    """Return ``s`` without any character listed in ``string.punctuation``.

    This repeats the definition given in the spatial-analysis cell.
    """
    return ''.join(filter(lambda ch: ch not in punctuation, s))

df[column_names] = df[column_names].applymap(strip_punctuation)



#Removes numbers

def numremove(x):
    """Return ``x`` with every run of digits removed."""
    # Raw string: "\d" in a normal string literal is an invalid escape sequence
    # and raises a warning on recent Python versions.
    return re.sub(r"\d+", "", x)

df[column_names] = df[column_names].applymap(numremove)



#Removes stopwords
def stopremove(x):
    """Return ``x`` with scikit-learn's English stop words removed.

    The text is split on whitespace, each word is compared in lower case
    against the stop-word set, and the surviving words (in their original
    case) are joined back together with single spaces.
    """
    # Import from the .text module, which exposes this constant in both old and
    # current scikit-learn releases (the stop_words submodule was removed).
    # The NLTK stop-word list that used to be loaded here was never used for
    # filtering, so it is no longer loaded.
    from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

    kept = [word for word in x.split() if word.lower() not in ENGLISH_STOP_WORDS]
    return ' '.join(kept)


#Removes the stop words
df[column_names] = df[column_names].applymap(stopremove)



print(df.shape)





Document term matrix and tfidf

In [None]:
# The tfidf stage

#Maximum number of features
n_features = 200

x = df['abstract']

# Settings shared by both vectorizers.
# max_df gives the highest proportion of documents that words are allowed to appear in.
vectorizer_options = dict(
    max_df=0.8,
    min_df=5,
    max_features=n_features,
    stop_words='english',
    ngram_range=(1, 2),
)

# TfidfVectorizer converts a collection of raw documents to a matrix of TF-IDF features.
tfidf_vectorizer = TfidfVectorizer(**vectorizer_options)
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(x)
print("done in %0.3fs." % (time() - t0))

#Converts the tfidf to a data frame which can be viewed
tfidfdata = pd.DataFrame(tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names())

# Use tf (raw term count) features
tf_vectorizer = CountVectorizer(**vectorizer_options)
t0 = time()
tf = tf_vectorizer.fit_transform(x)
print("done in %0.3fs." % (time() - t0))
print()


In [None]:
#import print_function

from time import time

# Settings for the topic-model cell.
# NOTE(review): n_features is reassigned here, replacing the 200 set in the tf-idf
# cell; re-running that cell after this one would build 1000-feature matrices.
n_samples = 2000
n_features = 1000
# Number of highest-weighted terms printed for each topic.
n_top_words = 10


def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``, one row of term weights per topic.
        feature_names: Sequence mapping a column index to its term.
        n_top_words: How many terms to print per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort() returns ascending order, so walk backwards from the end
        # to get the n_top_words largest weights first.
        top_indices = weights.argsort()[:-n_top_words - 1:-1]
        top_terms = [feature_names[i] for i in top_indices]
        print("Topic #%d: " % topic_idx + " ".join(top_terms))
 



# Report the dimensions of the matrix that is actually fitted below, rather than
# placeholder constants that do not describe it.
n_docs, n_terms = tfidf.shape
print("Fitting LDA models with tf-idf features, " "n_samples=%d and n_features=%d..." % (n_docs, n_terms))

#Notes this needs python 3 to work
# Three topics, online variational updates, fixed seed for repeatable output.
lda = LatentDirichletAllocation(n_components=3, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)

t0 = time()

#Fits the model to the term frequency-inverse document frequency matrix
# NOTE(review): LDA is normally fitted on raw term counts; the count matrix `tf`
# is already built in the previous cell. Confirm that tf-idf is intended here.
lda.fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
# Take the term names from the vectorizer that produced the fitted matrix.
tf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)