In [1]:
import pandas as pd
import json
import requests
from glob import glob
from bs4 import BeautifulSoup
from nameparser import HumanName
import unicodedata
import re

In [2]:
#
# Store and load spreadsheet
#

# Download the Google Sheet as CSV, keep a local copy on disk, then load it.
response = requests.get(
    'https://docs.google.com/spreadsheets/d/1TaFZwPyKXRcLBOmhbpRBFD2f2N3u5cOeuViyQkkxZ3A/export?format=csv',
    timeout=30,  # never hang the notebook on a stalled connection
)
# Fail loudly on an HTTP error instead of saving an error page as the CSV.
response.raise_for_status()
with open('spreadsheet.csv', 'wb') as f:
    f.write(response.content)
df_links = pd.read_csv('spreadsheet.csv')
# Column subset kept for reference (currently not applied):
#  usecols=['cardId', 'Name', 'parameter', 'cardSubtitle', 'cardfilter4Tri',
#       'cardfilter4Display', 'Image', 'Crédit image', 'cardfilter0Tri', 'cardfilter5Display', 'cardSection5Text',
#'cardSection6Text', 'New']

# Drop the informational row(s): any row whose Name is missing.
df_links = df_links[df_links['Name'].notna()].copy()

df_links['cardId'] = df_links['cardId'].astype(int)
# CSS-style flag: 'is-new' only when the New column is exactly 'New'.
df_links['isNew'] = df_links['New'].apply(lambda x: 'is-new' if x == 'New' else 'not-new')

df_links.shape

(102, 21)

In [3]:
#
# Get stories
#

# glob() returns paths in filesystem-dependent order; sort them so the row
# order (and the order of warnings) is the same on every run and machine.
files = sorted(glob('stories/*.html'))
def getContents(file):
    """Parse one story HTML file into a metadata dict.

    Every table row that has both a <th> and a <td> becomes one entry: the
    header text (colons removed, stripped) maps to the stripped cell text.
    The text of every <p> element, in document order, is stored under
    'Paragraphs'.

    Args:
        file: path of the HTML file to read.

    Returns:
        dict of header -> value, plus the 'Paragraphs' list of strings.
    """
    # Explicit encoding so accented text does not depend on the platform's
    # default codec. Assumes the stories are UTF-8 — TODO confirm.
    with open(file, 'r', encoding='utf-8') as f:
        doc = BeautifulSoup(f.read(), 'html.parser')
    meta = {}
    for tr in doc.select('table tr'):
        header, cell = tr.find('th'), tr.find('td')
        if header is None or cell is None:
            # Not a "label / value" row; nothing to record.
            continue
        meta[header.text.replace(':', '').strip()] = cell.text.strip()
    # All paragraphs are kept: the former `i != ''` test compared a Tag with a
    # string, so it never excluded anything, and getText relies on positions.
    meta['Paragraphs'] = [p.text for p in doc.select('p')]
    return meta

cols = ['Titre', 'Chapeau', 'Pour Correction (LA)', 'Paragraphs']

# Collect one record per story and build the frame once. DataFrame.append was
# removed in pandas 2.0, and appending row by row is quadratic anyway.
stories = []
for file in files:
    data = getContents(file)
    story = {}
    for key in cols:
        if key in data:
            story[key] = data[key]
        else:
            # Fall back to the file path when the title itself is what is missing.
            print('WARNING: no', key, 'for story', data.get('Titre', file))
            story[key] = ''
    stories.append(story)

# Rows now get a unique 0..n-1 index (the per-row append gave every row index 0).
df = pd.DataFrame(stories, columns=cols)
print('Dataframe contains', len(df), 'stories.')

def getText(paragraph):
    """Join the inner paragraphs of a story into one newline-separated text.

    The first and the last entry of `paragraph` are dropped; every remaining
    entry is stripped, and the joined result is stripped once more.
    """
    inner = paragraph[1:-1]
    joined = "\n".join(piece.strip() for piece in inner)
    return joined.strip()

# Derive the body text and the card subtitle, then discard the raw paragraph lists.
df = df.assign(
    Text=df['Paragraphs'].apply(getText),
    cardSubtitle=df['Chapeau'],
).drop(columns='Paragraphs')

Dataframe contains 101 stories.


In [4]:
def getName(titre):
    """Extract the person/subject name from a story title.

    The name is the run of letters, spaces, hyphens and ’ that follows the
    first "<digit><space>" in the title, optionally skipping a "27.09 " token
    first. Any other character (for example '.') ends the name.

    Args:
        titre: the story title.

    Returns:
        The extracted name (possibly empty, not stripped).

    Raises:
        ValueError: if the title contains no "<digit><space>" marker.
    """
    # Raw string: '\d' and '\-' are invalid escapes in a normal string literal.
    match = re.search(r'\d (27.09 ){0,1}([a-zA-ZÀ-ÿ’\- ]*)', titre)
    if match is None:
        # Name the offending title instead of failing with an AttributeError on None.
        raise ValueError('Cannot extract a name from title: {!r}'.format(titre))
    return match.group(2)

def getParameter(name):
    """Turn a display name into a slug.

    The name is NFKD-decomposed and its combining marks are dropped, then it is
    stripped, spaces become '-', and the result is lower-cased.
    """
    decomposed = unicodedata.normalize('NFKD', name)
    without_marks = u"".join(
        ch for ch in decomposed if unicodedata.combining(ch) == 0
    )
    slug = without_marks.strip().replace(' ', '-')
    return slug.lower()

def getLastName(name):
    """Return the `last` component that nameparser's HumanName extracts from `name`."""
    parsed = HumanName(name)
    return parsed.last

# Name parsed from the title, its slug, and the last name used as sort key.
card_titles = df['Titre'].apply(getName)
df['cardTitre'] = card_titles
df['parameter'] = card_titles.apply(getParameter)
df['Last Name'] = card_titles.apply(getLastName)
df = df.sort_values('Last Name')

In [5]:
# Manual slug correction: the name pattern in getName stops at '.', so
# "Valentina S. Velandia" is extracted as "Valentina S" and slugged 'valentina-s'.
df['parameter'] = df['parameter'].apply(lambda x: x.replace('valentina-s', 'valentina-velandia'))

In [6]:
# Sanity check: show the row(s) whose extracted name contains 'Valentina'.
df[df['cardTitre'].str.contains('Valentina')]

Unnamed: 0,Titre,Chapeau,Pour Correction (LA),Text,cardSubtitle,cardTitre,parameter,Last Name
0,Digital Shapers Portrait 93. 27.09 Valentina S...,"Cofondatrice de Capacity Zurich, Zurich",Corrigé,"Née en Colombie, Valentina S. Velandia a elle-...","Cofondatrice de Capacity Zurich, Zurich",Valentina S,valentina-velandia,S


In [7]:
# Show the stories that are NOT portraits ('Portarit' catches the misspelled titles).
is_portrait = df['Titre'].str.contains('Portrait|Portarit', flags=re.IGNORECASE)
df[~is_portrait]

Unnamed: 0,Titre,Chapeau,Pour Correction (LA),Text,cardSubtitle,cardTitre,parameter,Last Name
0,Digital Shapers Article 1. 27.09 Robots pour e...,"Le quotidien est hautement numérique, mais pas...",Corrigé,Vous avez des enfants ou petits-enfants en âge...,"Le quotidien est hautement numérique, mais pas...",Robots pour enfants,robots-pour-enfants,enfants


In [8]:
# Keep only the portrait stories ('Portarit' catches the misspelled titles).
is_portrait = df['Titre'].str.contains('Portrait|Portarit', flags=re.IGNORECASE)
df = df[is_portrait].copy()

In [9]:
# Outer join of spreadsheet rows and stories on the slug, so rows present on
# only one side are kept; overlapping story columns receive a '_' suffix.
dfm = pd.merge(df_links, df, how='outer', on='parameter', suffixes=('', '_'))

In [10]:
# The spreadsheet's Name replaces the name that was parsed from the story title.
dfm['cardTitre'] = dfm['Name']

In [11]:
# Shape of the rows that have an Age: `x == x` is False only for NaN.
dfm[dfm['Age'] == dfm['Age']].shape

(100, 28)

In [12]:
# Assign the result back: an in-place fillna on a column selection is a chained
# assignment and no longer updates dfm under pandas copy-on-write.
dfm['Titre'] = dfm['Titre'].fillna('')

In [13]:
# "portrait": missing for torrellas
# Show the merged rows whose story title is not a portrait title.
has_portrait_title = dfm['Titre'].str.contains('Portrait|Portarit', flags=re.IGNORECASE)
dfm[~has_portrait_title]

Unnamed: 0,cardId,Name,parameter,cardSubtitle,cardFilter4Tri,cardFilter4Display,cardFilter4DisplayBackup,Image,Crédit image,New,...,cardSection6Text,URL,isNew,Titre,Chapeau,Pour Correction (LA),Text,cardSubtitle_,cardTitre,Last Name
100,200,Editorial,editorial,Digital first,,,,,,,...,,,not-new,,,,,,Editorial,
101,300,Publireportage,publireportage,Un regard sur l’avenir,,,,,,,...,,,not-new,,,,,,Publireportage,


In [14]:
# Image path = fixed 'img/2018/' directory + file name from the Image column.
dfm['cardImage'] = 'img/2018/' + dfm['Image']

In [15]:
def cleanCredit(credit):
    """Normalise an image-credit cell.

    Removes every '?' and '©', turns '|' into '/', and trims surrounding
    whitespace.

    Args:
        credit: the raw cell value.

    Returns:
        The cleaned string, or None when `credit` is not a string
        (e.g. NaN for an empty cell).
    """
    if not isinstance(credit, str):
        return None
    credit = re.sub(r'\?|©', '', credit)
    # str.replace returns a new string; the result used to be thrown away,
    # so '|' was never actually replaced.
    credit = credit.replace('|', '/')
    return credit.strip()
# Cleaned credit line for each card, derived from the 'Crédit image' column.
dfm['copyright'] = dfm['Crédit image'].apply(cleanCredit)

In [16]:
# Assign the result back: an in-place fillna on a column selection is a chained
# assignment and no longer updates dfm under pandas copy-on-write.
dfm['Text'] = dfm['Text'].fillna('')

In [17]:
def cutText(text):
    """Return the teaser part of a story text.

    A text with more than two line breaks is cut down to its first two lines,
    joined with '</p><p>' and followed by a "read more" link. Any other text is
    returned unchanged.
    """
    if text.count('\n') <= 2:
        return text
    lead = text.split('\n')[:2]
    return '</p><p>'.join(lead) + ' <a class="readMore" href="#more">Plus…</a>'

def getCollapsedText(text):
    """Return the part of a story text that is hidden behind "read more".

    For a text with more than two line breaks this is everything after the
    second line break; for any other text it is the empty string.
    """
    if text.count('\n') <= 2:
        return ''
    # Splitting at most twice leaves the whole remainder, line breaks included,
    # in the third piece.
    _first, _second, remainder = text.split('\n', 2)
    return remainder
    
# Teaser and collapsed parts of the story text, plus the two info lines.
dfm['cardSection1Text'] = dfm['Text'].apply(cutText)
dfm['cardSection2Text'] = dfm['Text'].apply(getCollapsedText)
dfm['cardInfo1'] = dfm['cardSubtitle']
dfm['cardInfo2'] = dfm['Age']
# Prefix 'expat' with 'f-'. The negative lookbehind skips occurrences that are
# already prefixed, so re-running this cell no longer produces 'f-f-expat'.
# NaN cells (x != x) are passed through untouched.
dfm['cardFilter2Tri'] = dfm['cardFilter2Tri'].apply(
    lambda x: re.sub(r'(?<!f-)expat', 'f-expat', x) if x == x else x
)

In [18]:
# story links
def getStoryLink(cell):
    """Return the anchor part of a 'title|anchor' cell.

    Returns '' when the cell is not a string (e.g. NaN for an empty cell) or
    does not contain exactly one '|'.
    """
    try:
        _title, anchor = cell.split('|')
    except (AttributeError, ValueError):
        # AttributeError: cell has no .split (not a string).
        # ValueError: splitting did not yield exactly two parts.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return ''
    return anchor

def getStoryTitle(cell):
    """Return the title part of a 'title|anchor' cell.

    Returns '' when the cell is not a string (e.g. NaN for an empty cell) or
    does not contain exactly one '|'.
    """
    try:
        title, _anchor = cell.split('|')
    except (AttributeError, ValueError):
        # AttributeError: cell has no .split (not a string).
        # ValueError: splitting did not yield exactly two parts.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return ''
    return title

In [19]:
# Split the 'title|anchor' cells of cardSection5Text into two columns.
# Order matters: the title must be read before cardSection5Text is overwritten
# with the anchor. Re-running this cell on the already-split column yields ''
# for both, because the anchors no longer contain '|' — NOTE(review): assumes
# anchors never contain '|'.
dfm['cardSection4Text'] = dfm['cardSection5Text'].apply(getStoryTitle)
dfm['cardSection5Text'] = dfm['cardSection5Text'].apply(getStoryLink)

In [20]:
# Display label for each chapter filter key (looked up by getChapter).
# NOTE(review): the cardFilter4Tri values printed further down in this notebook
# (e.g. 'f-leaders', 'f-chercheurs') are not keys of this dict — confirm the
# key set before re-enabling the getChapter call.
chapitres = {
    'f-corporates': 'Grandes entreprises',
    'f-creatives': 'Créatifs',
    'f-do-gooders': 'Bienfaiteurs', 
    'f-enabler': 'Facilitateurs',
    'f-expats': 'Expats',
    'f-investors': 'Investisseurs',
    'f-researchers': 'Chercheurs',
    'f-serial-entrepreneurs': 'Serial entrepreneurs',
    'f-tech-leaders': 'Leaders',
    'f-unicorn-raisers': 'Éleveurs de licornes'
}

def getChapter(cardfilter):
    """Return the display label for a chapter filter key.

    NaN (the only value that differs from itself) gives ''. Any other value is
    looked up in `chapitres`; an unknown key raises KeyError.
    """
    is_missing = cardfilter != cardfilter
    return '' if is_missing else chapitres[cardfilter]
#dfm['cardFilter4Display'] = dfm['cardFilter4Tri'].apply(getChapter)

In [21]:
# Inject the editorial HTML into the card with cardId 200.
# Explicit encoding, matching the UTF-8 used when the JSON is exported, so the
# result does not depend on the platform default. Assumes the file is UTF-8 — TODO confirm.
with open('editorial.html', 'r', encoding='utf-8') as f:
    edito = f.read()
for i in dfm.index[dfm['cardId'] == 200]:
    dfm.at[i, 'cardTitre'] = 'Digital first'
    dfm.at[i, 'cardSection1Text'] = edito
    dfm.at[i, 'parameter'] = None

In [22]:
# Inject the sponsored-content HTML into the card with cardId 300.
# Explicit encoding, matching the UTF-8 used when the JSON is exported, so the
# result does not depend on the platform default. Assumes the file is UTF-8 — TODO confirm.
with open('publireportage_audi.html', 'r', encoding='utf-8') as f:
    audi = f.read()
for i in dfm.index[dfm['cardId'] == 300]:
    dfm.at[i, 'cardTitre'] = 'Un regard sur l’avenir'
    dfm.at[i, 'cardSection1Text'] = audi
    dfm.at[i, 'parameter'] = None

In [23]:
# Lower-cased copy of the chapter label; NaN cells are passed through.
# Despite the column name no singular form is built: the strip('s') idea at
# the end of the line is commented out.
dfm['cardFilter4DisplaySingular'] = dfm['cardFilter4Display'].apply(lambda x: x.lower() if x == x else x) #x.strip('s'))

In [39]:
# Load the existing site data; the keys of its first card define the card schema.
# The context manager closes the file (the bare open() leaked the handle).
with open('data.json', encoding='utf-8') as f:
    data = json.load(f)

columns = list(data['cards'][0].keys())
# Fields exported in addition to the ones already present in data.json.
columns.append('parameter')
columns.append('cardSection2Text')
columns.append('cardFilter0Tri')
columns.append('isNew')
columns.append('copyright')
columns.append('cardFilter4DisplaySingular')

# Add every schema column that dfm lacks as an empty placeholder BEFORE taking
# the export copy. These used to be added to dfm after dfx had been copied, so
# on a fresh run the exported cards were missing those keys.
emptyColumns = []
for col in columns:
    if col not in dfm.columns:
        emptyColumns.append(col)
        dfm[col] = '' #fakedata[col]

# Export frame: only the schema columns, in dfm's column order.
dfx = dfm[[col for col in dfm.columns if col in columns]].copy()

In [51]:
# Collapse a doubled 'f-f-ex' prefix back to 'f-ex'; NaN cells are passed through.
# The doubling occurs when the 'expat' -> 'f-expat' replacement applied to
# dfm['cardFilter2Tri'] earlier in this notebook is run more than once.
dfx['cardFilter2Tri'] = dfx['cardFilter2Tri'].apply(lambda x: x.replace('f-f-ex', 'f-ex') if x == x else x)

In [40]:
# Check: distribution of cardFilter0Tri values in the export frame.
dfx['cardFilter0Tri'].value_counts()

no-card    2
Name: cardFilter0Tri, dtype: int64

In [45]:
# Teaser images live in the 'teaser' sub-directory of the image directory.
# Only the leading directory is rewritten (count=1); replacing every '2018'
# would also corrupt file names that happen to contain that year.
# NaN cells (x != x) are passed through.
dfx['cardTeaserImage'] = dfx['cardImage'].apply(
    lambda x: x.replace('img/2018/', 'img/2018/teaser/', 1) if x == x else x
)

In [46]:
#
# export!!
#

# Serialise the cards with pandas, then parse them back into plain Python
# objects so they can be embedded in the site data structure.
jsondata = dfx.to_json(force_ascii=False, orient='records')
data['cards'] = json.loads(jsondata)

# NOTE(review): absolute, machine-specific output path.
with open('/Users/rnp/Sites/digital-shapers/json/data.json', 'w', encoding='utf-8') as jsonfile:
    json.dump(data, jsonfile, ensure_ascii=False, indent=4, sort_keys=True)

# Backup of the exported frame as CSV.
dfx.to_csv('json_export.csv')

# End

In [28]:
# Check: counts of cardFilter5Display values within each cardFilter4Display group.
dfm.groupby('cardFilter4Display')['cardFilter5Display'].value_counts()

cardFilter4Display   cardFilter5Display    
Bienfaiteurs         Recherche                  3
                     Développement durable      2
                     Services informatiques     2
                     Blockchain                 1
                     Blockchain?                1
                     Finance                    1
Chercheurs           Recherche                 10
Créatifs             Recherche                  6
                     Art                        2
                     Développement durable      1
                     Services informatiques     1
Entrepreneurs        Services informatiques     7
                     Finance                    6
                     Recherche                  3
                     Electronique               2
                     Santé                      2
                     Blockchain                 1
                     Développement durable      1
                     Marketing?                 1
      

In [30]:
# Check: number of cards per cardFilter4Display value.
dfm['cardFilter4Display'].value_counts()


Entrepreneurs          25
Facilitateurs          22
Grandes entreprises    13
Chercheurs             10
Bienfaiteurs           10
Leaders                10
Créatifs               10
Name: cardFilter4Display, dtype: int64

In [31]:
# Check: number of cards per cardFilter5Display value.
dfm['cardFilter5Display'].value_counts()

Services informatiques    26
Recherche                 26
Finance                   25
Développement durable      6
Blockchain                 2
Négoce                     2
Electronique               2
Art                        2
Santé                      2
Robotique                  1
Marketing?                 1
Agroalimentaire            1
Blockchain?                1
Transport                  1
Incubateurs                1
????                       1
Name: cardFilter5Display, dtype: int64

In [32]:
# Distinct cardFilter4Display values; going through a set gives no guaranteed order.
d = list(set(dfm['cardFilter4Display'].tolist()))

In [33]:
# Drop NaN entries: NaN is the only value for which `i == i` is False.
d = [i for i in d if i == i]

In [34]:
# For every display label, take the filter key of the first row carrying that
# label; v is therefore aligned position by position with d.
v = [
    dfm.loc[dfm['cardFilter4Display'] == label, 'cardFilter4Tri'].values[0]
    for label in d
    if label == label  # skip NaN
]
v

['f-leaders',
 'f-grandes-entreprises',
 'f-bienfaiteurs',
 'f-chercheurs',
 'f-entrepreneurs',
 'f-créatifs',
 'f-facilitateurs']

In [35]:
# Print one filter button per (filter key, display label) pair.
button_template = '<div class="btn" data-filter=".{}">{}</div>'
for filter_key, label in zip(v, d):
    print(button_template.format(filter_key, label))

<div class="btn" data-filter=".f-leaders">Leaders</div>
<div class="btn" data-filter=".f-grandes-entreprises">Grandes entreprises</div>
<div class="btn" data-filter=".f-bienfaiteurs">Bienfaiteurs</div>
<div class="btn" data-filter=".f-chercheurs">Chercheurs</div>
<div class="btn" data-filter=".f-entrepreneurs">Entrepreneurs</div>
<div class="btn" data-filter=".f-créatifs">Créatifs</div>
<div class="btn" data-filter=".f-facilitateurs">Facilitateurs</div>
