In [7]:
# !pip install tika
from tika import parser
import re
import numpy as np

In [8]:
def index_containing_substring(the_list, substring):
    """https://stackoverflow.com/questions/2170900/get-first-list-index-containing-sub-string"""
    for i, s in enumerate(the_list):
        if substring in s:
              return i
    return -1

In [9]:

def tip_table_of_contents_to_list(fname):

    raw = parser.from_file(fname)
    papers = [re.sub(r'[\s]{2,}','',re.sub('[\.]+','',line) ) for line in raw['content'].split('\n') if line]
    papers = papers[papers.index('PAPERS')+1:index_containing_substring(papers,"Available online")]
    lines_that_have_numbers = [len(re.findall(r'[\d]+',p))>0 for p in papers]
    
    start = 0
    new_papers = []
    for i in range(len(papers)):
        if lines_that_have_numbers[i]:
            new_papers.append(' '.join(papers[start:i+1]))
            start = i+1
            
    return new_papers

In [11]:
papers_tip2021 = tip_table_of_contents_to_list('TIP2021/Table_of_contents.pdf') + \
        tip_table_of_contents_to_list('TIP2021/Table_of_contents (1).pdf')

2022-03-23 09:49:31,813 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/1.24/tika-server-1.24.jar to /tmp/tika-server.jar.
2022-03-23 09:49:46,032 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/1.24/tika-server-1.24.jar.md5 to /tmp/tika-server.jar.md5.
2022-03-23 09:49:47,246 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...


In [12]:
len(papers_tip2021)

1482

In [13]:
papers_tip2020 = tip_table_of_contents_to_list('TIP2020/Table_of_contents.pdf') + \
        tip_table_of_contents_to_list('TIP2020/Table_of_contents (1).pdf')

In [14]:
len(papers_tip2020)

1512

In [40]:
np.random.choice(papers_tip2020,10).tolist()

['The Fourier-Argand Representation: An Optimal Basis of Steerable PatternsT Zhao and T Blu 6357',
 'MV-GNN: Multi-View Graph Neural Network for Compression Artifacts ReductionX He, Q Liu, and Y Yang 6829',
 '2D Quaternion Sparse Discriminant AnalysisX Xiao, Y Chen, Y-J Gong, and Y Zhou 2271',
 'Deep Heterogeneous Hashing for Face Video RetrievalS Qiao, R Wang, S Shan, and X Chen 1299',
 'Needles in a Haystack: Tracking City-Scale Moving Vehicles From Continuously Moving Satellite W Ao, Y Fu, X Hou, and F Xu 1944',
 'Blind Deblurring of Text Images Using a Text-Specific Hybrid DictionaryH Lee, C Jung, and C Kim 710',
 'Deep Learning-Based Picture-Wise Just Noticeable Distortion Prediction Model for Image Compression H Liu, Y Zhang, H Zhang, C Fan, S Kwong, C-C J Kuo, and X Fan 641',
 'Perceptual Temporal Incoherence-Guided Stereo Video Retargeting B Li, C-W Lin, S Liu, T Huang, W Gao, and C-C J Kuo 5767',
 'Visual Saliency via Embedding Hierarchical Knowledge in a Deep Neural Network F

In [42]:
np.random.choice(papers_tip2021,100).tolist()

['Is Context-Aware CNN Ready for the Surroundings? Panoramic Semantic Segmentation in the Wild K Yang, X Hu, and R Stiefelhagen 1866',
 'Sparse Learning-Based Correlation Filter for Robust TrackingW Zhang, L Jiao, Y Li, and J Liu 878',
 'Multi-Stage Feature Fusion Network for Video Super-Resolution H Song, W Xu, D Liu, B Liu, Q Liu, and D N Metaxas 2923',
 'Fs-DSM: Few-Shot Diagram-Sentence Matching via Cross-Modal Attention Graph Model X Hu, L Zhang, J Liu, Q Zheng, and J Zhou 8102',
 'Deconvolved Image Restoration From Auto-CorrelationsD Ancora and A Bassi 1332',
 'Searching Multi-Rate and Multi-Modal Temporal Enhanced Networks for Gesture Recognition Z Yu, B Zhou, J Wan, P Wang, H Chen, X Liu, S Z Li, and G Zhao 5626',
 'PRA-Net: Point Relation-Aware Network for 3D Point Cloud Analysis',
 '(Contents Continued on Page xxxvi) (Contents Continued from Page xxxv) Progressive Diversified Augmentation for General Robustness of DNNs: A Unified Approach H Yu, A Liu, G Li, J Yang, and C Zhan

In [15]:
np.random.choice(papers_tip2021,7).tolist()

['Learning Complete and Discriminative Direction Pattern for Robust Palmprint RecognitionS Zhao and B Zhang 1001',
 'Bi-Directional Exponential Angular Triplet Loss for RGB-Infrared Person Re-Identification H Ye, H Liu, F Meng, and X Li 1583',
 'Gradient-Based Feature Extraction From Raw Bayer Pattern ImagesW Zhou, L Zhang, S Gao, and X Lou 5122',
 'Image and Video Interpretation and Understanding Block Proposal Neural Architecture SearchJ Liu, S Zhou, Y Wu, K Chen, W Ouyang, and D Xu 15',
 'Tracking-by-Counting: Using Network Flows on Crowd Density Maps for Tracking Multiple Targets W Ren, X Wang, J Tian, Y Tang, and A B Chan 1439',
 'Fast Local Spatial Verification for Feature-Agnostic Large-Scale Image Retrieval J Brogan, A Bharati, D Moreira, A Rocha, K W Bowyer, P J Flynn, and W J Scheirer 6892',
 'Superpixels With Content-Adaptive CriteriaY Yuan, W Zhang, H Yu, and Z Zhu 7702']

In [18]:
np.random.choice(papers_tip2021,1).tolist()

['SLOAN: Scale-Adaptive Orientation Attention Network for Scene Text Recognition P Dai, H Zhang, and X Cao 1687']

In [1]:
# !pip install pdf2image
from pdf2image import convert_from_path
import os
import numpy as np

In [2]:
def select_random_pages_from_papers(src_folder,dst_folder,dpi = 93):
    os.makedirs(dst_folder, exist_ok=True)
    files = [f for f in os.listdir(src_folder) if f.endswith('.pdf')]
    for f in files:
        path = os.path.join(src_folder,f)
        pages = convert_from_path(path, dpi)
        i = np.random.choice(len(pages),1)[0]
        page = pages[i]
        page.save(os.path.join(dst_folder,f"{os.path.splitext(f)[0]}_{i+1}.png"), 'PNG')

In [3]:
# select_random_pages_from_papers('TIP2020/papers','TIP2020/pages')

In [4]:
select_random_pages_from_papers('TIP2021/papers','TIP2021/pages')